aboutsummaryrefslogtreecommitdiff
path: root/visualizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'visualizer.py')
-rw-r--r--visualizer.py255
1 files changed, 255 insertions, 0 deletions
diff --git a/visualizer.py b/visualizer.py
new file mode 100644
index 0000000..6608037
--- /dev/null
+++ b/visualizer.py
@@ -0,0 +1,255 @@
+"""
+Visualization module for code metrics analysis results.
+"""
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import Dict, List, Optional
+import os
+
+
class Visualizer:
    """Creates visualizations for code metrics analysis.

    Every ``plot_*`` method draws one figure and saves it as a PNG under
    ``output_dir``.  Each method validates its inputs first and returns
    silently (writing nothing) when required columns are absent or there
    are too few valid data points to plot.
    """

    def __init__(self, output_dir: str = 'figures'):
        """Prepare the output directory and configure the global plot style.

        Args:
            output_dir: Directory where generated PNG files are written.
                Created on demand, including parent directories.
        """
        self.output_dir = Path(output_dir)
        # BUGFIX: parents=True — plain mkdir(exist_ok=True) raised
        # FileNotFoundError for nested paths such as 'results/figures'.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        sns.set_style("whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)

    def plot_correlation_heatmap(self, df: pd.DataFrame,
                                 complexity_cols: List[str],
                                 issue_cols: List[str],
                                 filename: str = 'correlation_heatmap.png'):
        """Save a Pearson-correlation heatmap of complexity vs issue metrics.

        Args:
            df: Data with one row per observation.
            complexity_cols: Candidate complexity columns; names missing
                from ``df`` are skipped.
            issue_cols: Candidate issue columns; names missing from ``df``
                are skipped.
            filename: Output file name inside ``output_dir``.
        """
        # Keep only columns that actually exist in the frame.
        comp_cols = [col for col in complexity_cols if col in df.columns]
        iss_cols = [col for col in issue_cols if col in df.columns]
        if not comp_cols or not iss_cols:
            return

        # Pairwise Pearson correlations; a cell stays NaN when there are
        # fewer than 3 jointly non-null observations for that pair.
        corr_matrix = np.full((len(comp_cols), len(iss_cols)), np.nan)
        for i, comp_col in enumerate(comp_cols):
            for j, iss_col in enumerate(iss_cols):
                mask = df[[comp_col, iss_col]].notna().all(axis=1)
                if mask.sum() >= 3:
                    corr = df.loc[mask, comp_col].corr(
                        df.loc[mask, iss_col], method='pearson')
                    if not np.isnan(corr):
                        corr_matrix[i, j] = corr

        if np.isnan(corr_matrix).all():
            return  # no pair had enough data — nothing to draw

        fig, ax = plt.subplots(figsize=(14, 10))
        sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm',
                    center=0, vmin=-1, vmax=1, ax=ax,
                    xticklabels=iss_cols, yticklabels=comp_cols,
                    mask=np.isnan(corr_matrix))  # hide undefined cells
        ax.set_title('Correlation Heatmap: Complexity Metrics vs Issue Metrics',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def plot_complexity_vs_issues_scatter(self, df: pd.DataFrame,
                                          complexity_col: str,
                                          issue_col: str,
                                          filename: Optional[str] = None):
        """Save a scatter plot of one complexity metric vs one issue metric.

        A least-squares trend line is overlaid when the x-values have
        non-zero variance and the fit succeeds.

        Args:
            df: Data with one row per observation.
            complexity_col: Column plotted on the x-axis.
            issue_col: Column plotted on the y-axis.
            filename: Output file name; when None the figure is built and
                discarded without saving (original behavior preserved).
        """
        if complexity_col not in df.columns or issue_col not in df.columns:
            return

        mask = df[[complexity_col, issue_col]].notna().all(axis=1)
        if mask.sum() < 3:
            return

        x = df.loc[mask, complexity_col]
        y = df.loc[mask, issue_col]

        # Drop +/-inf values, which notna() does not catch.
        finite = np.isfinite(x) & np.isfinite(y)
        x = x[finite]
        y = y[finite]
        if len(x) < 2:
            # BUGFIX: the original allocated the figure before this check
            # and leaked it on early return; validate first, then allocate.
            return

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(x, y, alpha=0.5, s=50)

        # Optional trend line; skipped silently when fitting is degenerate.
        has_trend = False
        try:
            if x.std() > 1e-10:  # require variance so polyfit is well-posed
                slope, intercept = np.polyfit(x, y, 1)
                trend = np.poly1d((slope, intercept))
                x_sorted = np.sort(x)
                ax.plot(x_sorted, trend(x_sorted), "r--", alpha=0.8,
                        linewidth=2, label=f'Trend line (slope={slope:.3f})')
                has_trend = True
        except (np.linalg.LinAlgError, ValueError, RuntimeError):
            pass  # best-effort overlay only

        ax.set_xlabel(complexity_col.replace('_', ' ').title(), fontsize=12)
        ax.set_ylabel(issue_col.replace('_', ' ').title(), fontsize=12)
        ax.set_title(f'{complexity_col.replace("_", " ").title()} vs '
                     f'{issue_col.replace("_", " ").title()}',
                     fontsize=14, fontweight='bold')
        if has_trend:
            # BUGFIX: legend() with no labelled artists emits a matplotlib
            # warning; only show it when the trend line was drawn.
            ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        if filename:
            plt.savefig(self.output_dir / filename, dpi=300,
                        bbox_inches='tight')
        plt.close(fig)

    def plot_module_complexity_comparison(self, df: pd.DataFrame,
                                          complexity_col: str,
                                          filename: str = 'module_complexity_comparison.png'):
        """Save a two-panel comparison of a complexity metric across modules.

        Top panel: per-module box plots of the raw values.  Bottom panel:
        bar chart of per-module means, sorted descending.

        Args:
            df: Data containing a 'module' column and ``complexity_col``.
            complexity_col: Metric column to compare.
            filename: Output file name inside ``output_dir``.
        """
        if 'module' not in df.columns or complexity_col not in df.columns:
            return

        mask = df[complexity_col].notna()
        if mask.sum() == 0:
            return

        fig, axes = plt.subplots(2, 1, figsize=(14, 10))

        # Box plot of the raw distribution per module.
        modules = df.loc[mask, 'module'].unique()
        data_by_module = [df.loc[mask & (df['module'] == mod), complexity_col].values
                          for mod in modules]
        # NOTE(review): boxplot's `labels` kwarg is deprecated in favor of
        # `tick_labels` in matplotlib 3.9+; kept for compatibility with the
        # matplotlib version this project pins — confirm before upgrading.
        axes[0].boxplot(data_by_module, labels=modules)
        axes[0].set_ylabel(complexity_col.replace('_', ' ').title(), fontsize=12)
        axes[0].set_title(f'{complexity_col.replace("_", " ").title()} by Module',
                          fontsize=14, fontweight='bold')
        axes[0].tick_params(axis='x', rotation=45)
        axes[0].grid(True, alpha=0.3)

        # Bar plot of per-module means, largest first.
        module_means = (df.loc[mask]
                        .groupby('module')[complexity_col]
                        .mean()
                        .sort_values(ascending=False))
        axes[1].bar(range(len(module_means)), module_means.values)
        axes[1].set_xticks(range(len(module_means)))
        axes[1].set_xticklabels(module_means.index, rotation=45, ha='right')
        axes[1].set_ylabel(f'Mean {complexity_col.replace("_", " ").title()}', fontsize=12)
        axes[1].set_title(f'Average {complexity_col.replace("_", " ").title()} by Module',
                          fontsize=14, fontweight='bold')
        axes[1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def plot_distribution_analysis(self, df: pd.DataFrame,
                                   metric_col: str,
                                   filename: str = 'distribution_analysis.png'):
        """Save a four-panel distribution summary for one metric column.

        Panels: histogram, normal Q-Q plot, box plot, and empirical CDF.

        Args:
            df: Data containing ``metric_col``.
            metric_col: Column whose distribution is analyzed (NaNs dropped).
            filename: Output file name inside ``output_dir``.
        """
        if metric_col not in df.columns:
            return

        data = df[metric_col].dropna()
        if len(data) == 0:
            return

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        # Histogram.
        axes[0, 0].hist(data, bins=30, edgecolor='black', alpha=0.7)
        axes[0, 0].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[0, 0].set_ylabel('Frequency', fontsize=12)
        axes[0, 0].set_title('Histogram', fontsize=12, fontweight='bold')
        axes[0, 0].grid(True, alpha=0.3)

        # Q-Q plot against a normal distribution.  Imported locally so the
        # module loads even when scipy is absent and this method is unused.
        from scipy import stats
        stats.probplot(data, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot (Normal Distribution)', fontsize=12, fontweight='bold')
        axes[0, 1].grid(True, alpha=0.3)

        # Box plot.
        axes[1, 0].boxplot(data, vert=True)
        axes[1, 0].set_ylabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[1, 0].set_title('Box Plot', fontsize=12, fontweight='bold')
        axes[1, 0].grid(True, alpha=0.3)

        # Empirical cumulative distribution function.
        sorted_data = np.sort(data)
        cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
        axes[1, 1].plot(sorted_data, cumulative, linewidth=2)
        axes[1, 1].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[1, 1].set_ylabel('Cumulative Probability', fontsize=12)
        axes[1, 1].set_title('Cumulative Distribution Function', fontsize=12, fontweight='bold')
        axes[1, 1].grid(True, alpha=0.3)

        plt.suptitle(f'Distribution Analysis: {metric_col.replace("_", " ").title()}',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    def plot_regression_results(self, regression_results: Dict,
                                filename: str = 'regression_results.png'):
        """Visualize regression coefficients, confidence intervals and p-values.

        Coefficients with p < 0.05 are drawn in red, others in gray.

        Args:
            regression_results: Mapping with a required 'coefficients' dict
                (feature -> coefficient) and optional 'p_values',
                'confidence_intervals' (feature -> (lower, upper)) and
                'r_squared' entries.
            filename: Output file name inside ``output_dir``.
        """
        if 'coefficients' not in regression_results:
            return

        coefficients = regression_results['coefficients']
        p_values = regression_results.get('p_values', {})
        ci = regression_results.get('confidence_intervals', {})

        if not coefficients:
            return

        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Left panel: coefficient magnitudes with 95% CIs.
        features = list(coefficients.keys())
        coef_values = list(coefficients.values())
        # Missing p-values default to 1 (not significant → gray).
        colors = ['red' if p_values.get(f, 1) < 0.05 else 'gray'
                  for f in features]

        y_pos = np.arange(len(features))
        axes[0].barh(y_pos, coef_values, color=colors, alpha=0.7)

        # Overlay confidence intervals as horizontal black segments.
        for i, feature in enumerate(features):
            if feature in ci:
                ci_lower, ci_upper = ci[feature]
                axes[0].plot([ci_lower, ci_upper], [i, i], 'k-', linewidth=2)

        axes[0].set_yticks(y_pos)
        axes[0].set_yticklabels(features)
        axes[0].set_xlabel('Coefficient Value', fontsize=12)
        axes[0].set_title('Regression Coefficients with 95% CI', fontsize=14, fontweight='bold')
        axes[0].axvline(x=0, color='black', linestyle='--', linewidth=1)
        axes[0].grid(True, alpha=0.3)

        # Right panel: p-values with the 0.05 significance threshold.
        p_vals = [p_values.get(f, 1) for f in features]
        axes[1].barh(y_pos, p_vals, color=colors, alpha=0.7)
        axes[1].axvline(x=0.05, color='red', linestyle='--', linewidth=2,
                        label='Significance Level (0.05)')
        axes[1].set_yticks(y_pos)
        axes[1].set_yticklabels(features)
        axes[1].set_xlabel('P-value', fontsize=12)
        axes[1].set_title('P-values for Coefficients', fontsize=14, fontweight='bold')
        # BUGFIX: when every p-value is 0 the original range was [0, 0],
        # a degenerate axis; fall back to a unit range in that case.
        upper = max(p_vals) * 1.1
        axes[1].set_xlim([0, upper if upper > 0 else 1.0])
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.suptitle(f'Regression Analysis Results (R² = {regression_results.get("r_squared", 0):.3f})',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close(fig)
+