Skip to content

Commit 39b915a

Browse files
committed
create corr/plots for all metrics, increase log readability, close matplotlib figures
1 parent 684d79d commit 39b915a

File tree

4 files changed

+44
-30
lines changed

4 files changed

+44
-30
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
.idea
33
/venv
44
__pycache__
5+
.pytest_cache
56

67
# Project-specific
78
/analysis/output

analysis/BehavioralSubjective.py

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,11 @@ def plot_correlation_correctness(df, metric):
5454
plt.xlabel(metric, color=color)
5555

5656
corr = df[metric].corr(df['Correct'], method='kendall')
57-
print('Kendall corr:', corr)
57+
print('Metric: ' + metric + ' ~ Correctness')
58+
print('-> Kendall corr:', corr)
5859

5960
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df[metric], df['ResponseTime'])
60-
print('r squared:', r_value**2)
61+
print('-> r squared:', r_value**2)
6162

6263
left, right = plt.xlim()
6364
ax1.text(left+((right-left)/40), 14, 'Kendall τ: ' + format(corr, '.2f'), fontdict=graph_label)
@@ -67,6 +68,7 @@ def plot_correlation_correctness(df, metric):
6768
plt.tight_layout()
6869

6970
plt.savefig(ROOT_DIR + '/analysis/output/' + metric + '_Correctness.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
71+
plt.close(plt.gcf())
7072

7173

7274
def plot_correlation_responsetime(df, metric):
@@ -87,10 +89,11 @@ def plot_correlation_responsetime(df, metric):
8789
plt.xlabel("")
8890

8991
corr = df[metric].corr(df['ResponseTime'], method='kendall')
90-
print('Kendall corr:', corr)
92+
print('Metric: ' + metric + ' ~ ResponseTime')
93+
print('-> Kendall corr:', corr)
9194

9295
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df[metric], df['ResponseTime'])
93-
print('r squared:', r_value**2)
96+
print('-> r squared:', r_value**2)
9497

9598
left, right = plt.xlim()
9699
ax1.text(left+((right-left)/40), 8, 'Kendall τ: ' + format(corr, '.2f'), fontdict=graph_label)
@@ -100,6 +103,7 @@ def plot_correlation_responsetime(df, metric):
100103
plt.tight_layout()
101104

102105
plt.savefig(ROOT_DIR + '/analysis/output/' + metric + '_ResponseTime.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
106+
plt.close(plt.gcf())
103107

104108

105109
def plot_correlation_subjcomplexity_metrics(df, metric):
@@ -120,10 +124,11 @@ def plot_correlation_subjcomplexity_metrics(df, metric):
120124
plt.xlabel(metric, color=color)
121125

122126
corr = df[metric].corr(df['subj_complexity'], method='kendall')
123-
print('Kendall corr:', corr)
127+
print('Metric: ' + metric + ' ~ SubjComplexity')
128+
print('-> Kendall corr:', corr)
124129

125130
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df[metric], df['subj_complexity'])
126-
print('r squared:', r_value**2)
131+
print('-> r squared:', r_value**2)
127132

128133
left, right = plt.xlim()
129134
ax1.text(left+((right-left)/40), 10, 'Kendall τ: ' + format(corr, '.2f'), fontdict=graph_label)
@@ -133,6 +138,7 @@ def plot_correlation_subjcomplexity_metrics(df, metric):
133138
plt.tight_layout()
134139

135140
plt.savefig(ROOT_DIR + '/analysis/output/SubjComplexity_' + metric + '.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
141+
plt.close(plt.gcf())
136142

137143

138144
def plot_correlation_subjcomplexity_responsetime(df):
@@ -145,10 +151,11 @@ def plot_correlation_subjcomplexity_responsetime(df):
145151
plt.ylim((0, 61))
146152

147153
corr = df['subj_complexity'].corr(df['ResponseTime'], method='kendall')
148-
print('Kendall corr:', corr)
154+
print('SubjComplexity ~ ResponseTime')
155+
print('-> Kendall corr:', corr)
149156

150157
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['subj_complexity'], df['ResponseTime'])
151-
print('r squared:', r_value**2)
158+
print('-> r squared:', r_value**2)
152159

153160
plt.ylabel("Response Time in sec.")
154161
plt.xlabel("Subjective Complexity Rating")
@@ -161,6 +168,7 @@ def plot_correlation_subjcomplexity_responsetime(df):
161168
plt.tight_layout()
162169

163170
plt.savefig(ROOT_DIR + '/analysis/output/SubjComplexity_ResponseTime.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
171+
plt.close(plt.gcf())
164172

165173

166174
def plot_correlation_subjcomplexity_correctness(df):
@@ -173,10 +181,11 @@ def plot_correlation_subjcomplexity_correctness(df):
173181
plt.ylim((0, 100))
174182

175183
corr = df['subj_complexity'].corr(df['Correct'], method='kendall')
176-
print('Kendall corr:', corr)
184+
print('SubjComplexity ~ Correctness')
185+
print('-> Kendall corr:', corr)
177186

178187
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['subj_complexity'], df['Correct'])
179-
print('r squared:', r_value ** 2)
188+
print('-> r squared:', r_value ** 2)
180189

181190
plt.ylabel("Correct Responses in %")
182191
plt.xlabel("Subjective Complexity Rating")
@@ -189,6 +198,7 @@ def plot_correlation_subjcomplexity_correctness(df):
189198
plt.tight_layout()
190199

191200
plt.savefig(ROOT_DIR + '/analysis/output/SubjComplexity_Correctness.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
201+
plt.close(plt.gcf())
192202

193203

194204
def select_color_for_metric(metric):
@@ -231,16 +241,13 @@ def main():
231241
snippet_correctness = snippet_behavioral.groupby('Snippet').mean()
232242
snippet_correctness["Correct"] = snippet_correctness['Correct'].apply(convert_to_percent)
233243

234-
# create plots for metrics ~ response time
235-
plot_correlation_responsetime(snippet_behavioral, "LOC")
236-
plot_correlation_responsetime(snippet_behavioral, "DepDegree")
237-
plot_correlation_responsetime(snippet_behavioral, "McCabe")
238-
plot_correlation_responsetime(snippet_behavioral, "Halstead")
244+
# create plots for metrics ~ response time & correctness
245+
metrics = ["LOC", "DepDegree", "McCabe", "Halstead"] # for a small run with the four main representatives
246+
metrics = list(snippet_metrics)[2:] # for a full run
239247

240-
plot_correlation_correctness(snippet_correctness, "LOC")
241-
plot_correlation_correctness(snippet_correctness, "DepDegree")
242-
plot_correlation_correctness(snippet_correctness, "McCabe")
243-
plot_correlation_correctness(snippet_correctness, "Halstead")
248+
for metric in metrics:
249+
plot_correlation_responsetime(snippet_behavioral, metric)
250+
plot_correlation_correctness(snippet_correctness, metric)
244251

245252
# correlate with behavioral data
246253
print('\n##### \n correlating subjective complexity with behavioral data')

analysis/BrainActivationAnalysis.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def plot_ba_subj_rating(df, ba, activation=True, participant=None):
3434
plt.xlabel("Subjective Complexity Rating")
3535

3636
corr = df['subj_complexity'].corr(df[ba], method='kendall')
37+
print('subj_complexity: ~ BA: ' + ba)
3738
print('Kendall corr:', corr)
3839

3940
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['subj_complexity'], df[ba])
@@ -56,6 +57,7 @@ def plot_ba_subj_rating(df, ba, activation=True, participant=None):
5657
prefix += participant + '_'
5758

5859
plt.savefig(prefix + ba + '.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
60+
plt.clf()
5961

6062

6163
def plot_ba_for_metric(df, metric, ba, activation=True):
@@ -89,10 +91,11 @@ def plot_ba_for_metric(df, metric, ba, activation=True):
8991
plt.xlabel("")
9092

9193
corr = df[metric].corr(df[ba], method='kendall')
92-
print('Kendall corr:', corr)
94+
print('Metric: ' + metric + ' ~ BA: ' + ba)
95+
print('-> Kendall corr:', corr)
9396

9497
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df[metric], df[ba])
95-
print('r squared:', r_value ** 2)
98+
print('-> r squared:', r_value ** 2)
9699

97100
axes = plt.gca()
98101
if activation:
@@ -118,6 +121,8 @@ def plot_ba_for_metric(df, metric, ba, activation=True):
118121
prefix = ROOT_DIR + '/analysis/output/deactivation_'
119122

120123
plt.savefig(prefix + metric + '_' + ba + '.pdf', dpi=300, bbox_inches='tight', pad_inches=0)
124+
plt.clf()
125+
plt.close(fig)
121126

122127

123128
def get_bas(activation):
@@ -130,15 +135,16 @@ def get_bas(activation):
130135
return bas
131136

132137

133-
def create_plots(df, activation=True):
138+
def create_plots(df, snippet_metrics, activation=True):
134139
bas = get_bas(activation)
135140

136141
# plot the stats
137142
for ba in bas:
138-
plot_ba_for_metric(df, 'LOC', ba, activation)
139-
plot_ba_for_metric(df, 'DepDegree', ba, activation)
140-
plot_ba_for_metric(df, 'McCabe', ba, activation)
141-
plot_ba_for_metric(df, 'Halstead', ba, activation)
143+
metrics = ["LOC", "DepDegree", "McCabe", "Halstead"] # for a small run with the four main representatives
144+
metrics = list(snippet_metrics)[2:] # for a full run
145+
146+
for metric in metrics:
147+
plot_ba_for_metric(df, metric, ba, activation)
142148

143149

144150
def compute_statistics(df, activation=True):
@@ -183,8 +189,8 @@ def main():
183189
df_ba_cond_act = pd.merge(df_ba_cond_act, snippet_metrics, how='left', left_on=['condition'], right_on=['Snippet'])
184190
df_ba_cond_deact = pd.merge(df_ba_cond_deact, snippet_metrics, how='left', left_on=['condition'], right_on=['Snippet'])
185191

186-
create_plots(df_ba_cond_act, True)
187-
create_plots(df_ba_cond_deact, False)
192+
create_plots(df_ba_cond_act, snippet_metrics, True)
193+
create_plots(df_ba_cond_deact, snippet_metrics, False)
188194

189195
compute_statistics(df_ba_cond_act, True)
190196
compute_statistics(df_ba_cond_deact, False)

tests/test_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_behavioral_subjective(self):
2828

2929
BehavioralSubjective.main()
3030
file_count_end = get_number_of_files_in_output()
31-
self.assertEqual(15, file_count_end) # pipeline should create 14 files + .gitkeep
31+
self.assertEqual(75, file_count_end) # pipeline should create 74 files + .gitkeep
3232

3333
def test_behavioral_brain(self):
3434
empty_output_dir()
@@ -48,7 +48,7 @@ def test_brain_activation(self):
4848

4949
BrainActivationAnalysis.main()
5050
file_count_end = get_number_of_files_in_output()
51-
self.assertEqual(31, file_count_end) # pipeline should create 30 files + .gitkeep
51+
self.assertEqual(211, file_count_end) # pipeline should create 210 files + .gitkeep
5252

5353

5454
if __name__ == '__main__':

0 commit comments

Comments
 (0)