aboutsummaryrefslogtreecommitdiff
path: root/visualizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'visualizer.py')
-rw-r--r--visualizer.py255
1 files changed, 255 insertions, 0 deletions
diff --git a/visualizer.py b/visualizer.py
new file mode 100644
index 0000000..6608037
--- /dev/null
+++ b/visualizer.py
@@ -0,0 +1,255 @@
+"""
+Visualization module for code metrics analysis results.
+"""
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import Dict, List, Optional
+import os
+
+
class Visualizer:
    """Creates visualizations for code metrics analysis.

    Every ``plot_*`` method draws one figure and saves it as a PNG under
    ``output_dir``.  Each method validates its inputs first and returns
    silently (writing nothing) when required columns are absent or there
    are too few valid data points to plot.
    """

    def __init__(self, output_dir: str = 'figures'):
        """Prepare the output directory and configure the global plot style.

        Args:
            output_dir: Directory where generated PNG files are written.
                Created on demand, including parent directories.
        """
        self.output_dir = Path(output_dir)
        # BUGFIX: parents=True — plain mkdir(exist_ok=True) raised
        # FileNotFoundError for nested paths such as 'results/figures'.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        sns.set_style("whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)

    def plot_correlation_heatmap(self, df: pd.DataFrame,
                                 complexity_cols: List[str],
                                 issue_cols: List[str],
                                 filename: str = 'correlation_heatmap.png'):
        """Save a Pearson-correlation heatmap of complexity vs issue metrics.

        Args:
            df: Data with one row per observation.
            complexity_cols: Candidate complexity columns; names missing
                from ``df`` are skipped.
            issue_cols: Candidate issue columns; names missing from ``df``
                are skipped.
            filename: Output file name inside ``output_dir``.
        """
        # Keep only columns that actually exist in the frame.
        comp_cols = [col for col in complexity_cols if col in df.columns]
        iss_cols = [col for col in issue_cols if col in df.columns]
        if not comp_cols or not iss_cols:
            return

        # Pairwise Pearson correlations; a cell stays NaN when there are
        # fewer than 3 jointly non-null observations for that pair.
        corr_matrix = np.full((len(comp_cols), len(iss_cols)), np.nan)
        for i, comp_col in enumerate(comp_cols):
            for j, iss_col in enumerate(iss_cols):
                mask = df[[comp_col, iss_col]].notna().all(axis=1)
                if mask.sum() >= 3:
                    corr = df.loc[mask, comp_col].corr(
                        df.loc[mask, iss_col], method='pearson')
                    if not np.isnan(corr):
                        corr_matrix[i, j] = corr

        if np.isnan(corr_matrix).all():
            return  # no pair had enough data — nothing to draw

        fig, ax = plt.subplots(figsize=(14, 10))
        sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm',
                    center=0, vmin=-1, vmax=1, ax=ax,
                    xticklabels=iss_cols, yticklabels=comp_cols,
                    mask=np.isnan(corr_matrix))  # hide undefined cells
        ax.set_title('Correlation Heatmap: Complexity Metrics vs Issue Metrics',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def plot_complexity_vs_issues_scatter(self, df: pd.DataFrame,
                                          complexity_col: str,
                                          issue_col: str,
                                          filename: Optional[str] = None):
        """Save a scatter plot of one complexity metric vs one issue metric.

        A least-squares trend line is overlaid when the x-values have
        non-zero variance and the fit succeeds.

        Args:
            df: Data with one row per observation.
            complexity_col: Column plotted on the x-axis.
            issue_col: Column plotted on the y-axis.
            filename: Output file name; when None the figure is built and
                discarded without saving (original behavior preserved).
        """
        if complexity_col not in df.columns or issue_col not in df.columns:
            return

        mask = df[[complexity_col, issue_col]].notna().all(axis=1)
        if mask.sum() < 3:
            return

        x = df.loc[mask, complexity_col]
        y = df.loc[mask, issue_col]

        # Drop +/-inf values, which notna() does not catch.
        finite = np.isfinite(x) & np.isfinite(y)
        x = x[finite]
        y = y[finite]
        if len(x) < 2:
            # BUGFIX: the original allocated the figure before this check
            # and leaked it on early return; validate first, then allocate.
            return

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(x, y, alpha=0.5, s=50)

        # Optional trend line; skipped silently when fitting is degenerate.
        has_trend = False
        try:
            if x.std() > 1e-10:  # require variance so polyfit is well-posed
                slope, intercept = np.polyfit(x, y, 1)
                trend = np.poly1d((slope, intercept))
                x_sorted = np.sort(x)
                ax.plot(x_sorted, trend(x_sorted), "r--", alpha=0.8,
                        linewidth=2, label=f'Trend line (slope={slope:.3f})')
                has_trend = True
        except (np.linalg.LinAlgError, ValueError, RuntimeError):
            pass  # best-effort overlay only

        ax.set_xlabel(complexity_col.replace('_', ' ').title(), fontsize=12)
        ax.set_ylabel(issue_col.replace('_', ' ').title(), fontsize=12)
        ax.set_title(f'{complexity_col.replace("_", " ").title()} vs '
                     f'{issue_col.replace("_", " ").title()}',
                     fontsize=14, fontweight='bold')
        if has_trend:
            # BUGFIX: legend() with no labelled artists emits a matplotlib
            # warning; only show it when the trend line was drawn.
            ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        if filename:
            plt.savefig(self.output_dir / filename, dpi=300,
                        bbox_inches='tight')
        plt.close(fig)

    def plot_module_complexity_comparison(self, df: pd.DataFrame,
                                          complexity_col: str,
                                          filename: str = 'module_complexity_comparison.png'):
        """Save a two-panel comparison of a complexity metric across modules.

        Top panel: per-module box plots of the raw values.  Bottom panel:
        bar chart of per-module means, sorted descending.

        Args:
            df: Data containing a 'module' column and ``complexity_col``.
            complexity_col: Metric column to compare.
            filename: Output file name inside ``output_dir``.
        """
        if 'module' not in df.columns or complexity_col not in df.columns:
            return

        mask = df[complexity_col].notna()
        if mask.sum() == 0:
            return

        fig, axes = plt.subplots(2, 1, figsize=(14, 10))

        # Box plot of the raw distribution per module.
        modules = df.loc[mask, 'module'].unique()
        data_by_module = [df.loc[mask & (df['module'] == mod), complexity_col].values
                          for mod in modules]
        # NOTE(review): boxplot's `labels` kwarg is deprecated in favor of
        # `tick_labels` in matplotlib 3.9+; kept for compatibility with the
        # matplotlib version this project pins — confirm before upgrading.
        axes[0].boxplot(data_by_module, labels=modules)
        axes[0].set_ylabel(complexity_col.replace('_', ' ').title(), fontsize=12)
        axes[0].set_title(f'{complexity_col.replace("_", " ").title()} by Module',
                          fontsize=14, fontweight='bold')
        axes[0].tick_params(axis='x', rotation=45)
        axes[0].grid(True, alpha=0.3)

        # Bar plot of per-module means, largest first.
        module_means = (df.loc[mask]
                        .groupby('module')[complexity_col]
                        .mean()
                        .sort_values(ascending=False))
        axes[1].bar(range(len(module_means)), module_means.values)
        axes[1].set_xticks(range(len(module_means)))
        axes[1].set_xticklabels(module_means.index, rotation=45, ha='right')
        axes[1].set_ylabel(f'Mean {complexity_col.replace("_", " ").title()}', fontsize=12)
        axes[1].set_title(f'Average {complexity_col.replace("_", " ").title()} by Module',
                          fontsize=14, fontweight='bold')
        axes[1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def plot_distribution_analysis(self, df: pd.DataFrame,
                                   metric_col: str,
                                   filename: str = 'distribution_analysis.png'):
        """Save a four-panel distribution summary for one metric column.

        Panels: histogram, normal Q-Q plot, box plot, and empirical CDF.

        Args:
            df: Data containing ``metric_col``.
            metric_col: Column whose distribution is analyzed (NaNs dropped).
            filename: Output file name inside ``output_dir``.
        """
        if metric_col not in df.columns:
            return

        data = df[metric_col].dropna()
        if len(data) == 0:
            return

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        # Histogram.
        axes[0, 0].hist(data, bins=30, edgecolor='black', alpha=0.7)
        axes[0, 0].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[0, 0].set_ylabel('Frequency', fontsize=12)
        axes[0, 0].set_title('Histogram', fontsize=12, fontweight='bold')
        axes[0, 0].grid(True, alpha=0.3)

        # Q-Q plot against a normal distribution.  Imported locally so the
        # module loads even when scipy is absent and this method is unused.
        from scipy import stats
        stats.probplot(data, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot (Normal Distribution)', fontsize=12, fontweight='bold')
        axes[0, 1].grid(True, alpha=0.3)

        # Box plot.
        axes[1, 0].boxplot(data, vert=True)
        axes[1, 0].set_ylabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[1, 0].set_title('Box Plot', fontsize=12, fontweight='bold')
        axes[1, 0].grid(True, alpha=0.3)

        # Empirical cumulative distribution function.
        sorted_data = np.sort(data)
        cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
        axes[1, 1].plot(sorted_data, cumulative, linewidth=2)
        axes[1, 1].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[1, 1].set_ylabel('Cumulative Probability', fontsize=12)
        axes[1, 1].set_title('Cumulative Distribution Function', fontsize=12, fontweight='bold')
        axes[1, 1].grid(True, alpha=0.3)

        plt.suptitle(f'Distribution Analysis: {metric_col.replace("_", " ").title()}',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def plot_regression_results(self, regression_results: Dict,
                                filename: str = 'regression_results.png'):
        """Visualize regression coefficients, confidence intervals and p-values.

        Coefficients with p < 0.05 are drawn in red, others in gray.

        Args:
            regression_results: Mapping with a required 'coefficients' dict
                (feature -> coefficient) and optional 'p_values',
                'confidence_intervals' (feature -> (lower, upper)) and
                'r_squared' entries.
            filename: Output file name inside ``output_dir``.
        """
        if 'coefficients' not in regression_results:
            return

        coefficients = regression_results['coefficients']
        p_values = regression_results.get('p_values', {})
        ci = regression_results.get('confidence_intervals', {})

        if not coefficients:
            return

        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Left panel: coefficient magnitudes with 95% CIs.
        features = list(coefficients.keys())
        coef_values = list(coefficients.values())
        # Missing p-values default to 1 (not significant → gray).
        colors = ['red' if p_values.get(f, 1) < 0.05 else 'gray'
                  for f in features]

        y_pos = np.arange(len(features))
        axes[0].barh(y_pos, coef_values, color=colors, alpha=0.7)

        # Overlay confidence intervals as horizontal black segments.
        for i, feature in enumerate(features):
            if feature in ci:
                ci_lower, ci_upper = ci[feature]
                axes[0].plot([ci_lower, ci_upper], [i, i], 'k-', linewidth=2)

        axes[0].set_yticks(y_pos)
        axes[0].set_yticklabels(features)
        axes[0].set_xlabel('Coefficient Value', fontsize=12)
        axes[0].set_title('Regression Coefficients with 95% CI', fontsize=14, fontweight='bold')
        axes[0].axvline(x=0, color='black', linestyle='--', linewidth=1)
        axes[0].grid(True, alpha=0.3)

        # Right panel: p-values with the 0.05 significance threshold.
        p_vals = [p_values.get(f, 1) for f in features]
        axes[1].barh(y_pos, p_vals, color=colors, alpha=0.7)
        axes[1].axvline(x=0.05, color='red', linestyle='--', linewidth=2,
                        label='Significance Level (0.05)')
        axes[1].set_yticks(y_pos)
        axes[1].set_yticklabels(features)
        axes[1].set_xlabel('P-value', fontsize=12)
        axes[1].set_title('P-values for Coefficients', fontsize=14, fontweight='bold')
        # BUGFIX: when every p-value is 0 the original range was [0, 0],
        # a degenerate axis; fall back to a unit range in that case.
        upper = max(p_vals) * 1.1
        axes[1].set_xlim([0, upper if upper > 0 else 1.0])
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.suptitle(f'Regression Analysis Results (R² = {regression_results.get("r_squared", 0):.3f})',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)
+