""" Visualization module for code metrics analysis results. """ import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import numpy as np from pathlib import Path from typing import Dict, List, Optional import os class Visualizer: """Creates visualizations for code metrics analysis.""" def __init__(self, output_dir: str = 'figures'): self.output_dir = Path(output_dir) self.output_dir.mkdir(exist_ok=True) sns.set_style("whitegrid") plt.rcParams['figure.figsize'] = (12, 8) def plot_correlation_heatmap(self, df: pd.DataFrame, complexity_cols: List[str], issue_cols: List[str], filename: str = 'correlation_heatmap.png'): """Create correlation heatmap between complexity and issues.""" # Filter to existing columns comp_cols = [col for col in complexity_cols if col in df.columns] iss_cols = [col for col in issue_cols if col in df.columns] if not comp_cols or not iss_cols: return # Calculate correlation matrix corr_matrix = np.full((len(comp_cols), len(iss_cols)), np.nan) for i, comp_col in enumerate(comp_cols): for j, iss_col in enumerate(iss_cols): mask = df[[comp_col, iss_col]].notna().all(axis=1) if mask.sum() >= 3: corr = df.loc[mask, comp_col].corr(df.loc[mask, iss_col], method='pearson') if not np.isnan(corr): corr_matrix[i, j] = corr # Check if we have any valid correlations if np.isnan(corr_matrix).all(): return # Create heatmap fig, ax = plt.subplots(figsize=(14, 10)) sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=ax, xticklabels=iss_cols, yticklabels=comp_cols, mask=np.isnan(corr_matrix)) ax.set_title('Correlation Heatmap: Complexity Metrics vs Issue Metrics', fontsize=16, fontweight='bold') plt.tight_layout() plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight') plt.close() def plot_complexity_vs_issues_scatter(self, df: pd.DataFrame, complexity_col: str, issue_col: str, filename: Optional[str] = None): """Create scatter plot of complexity vs issues.""" if complexity_col not in df.columns or issue_col not in df.columns: return mask = df[[complexity_col, issue_col]].notna().all(axis=1) if mask.sum() < 3: return fig, ax = plt.subplots(figsize=(10, 6)) x = df.loc[mask, complexity_col] y = df.loc[mask, issue_col] # Remove any infinite or NaN values valid_mask = np.isfinite(x) & np.isfinite(y) x = x[valid_mask] y = y[valid_mask] if len(x) < 2: return ax.scatter(x, y, alpha=0.5, s=50) # Add regression line (with error handling) try: # Check if x has variance (not constant) if x.std() > 1e-10: z = np.polyfit(x, y, 1) p = np.poly1d(z) x_sorted = np.sort(x) ax.plot(x_sorted, p(x_sorted), "r--", alpha=0.8, linewidth=2, label=f'Trend line (slope={z[0]:.3f})') except (np.linalg.LinAlgError, ValueError, RuntimeError): # Skip regression line if fitting fails pass ax.set_xlabel(complexity_col.replace('_', ' ').title(), fontsize=12) ax.set_ylabel(issue_col.replace('_', ' ').title(), fontsize=12) ax.set_title(f'{complexity_col.replace("_", " ").title()} vs ' f'{issue_col.replace("_", " ").title()}', fontsize=14, fontweight='bold') ax.legend() ax.grid(True, alpha=0.3) plt.tight_layout() if filename: plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight') plt.close() def plot_module_complexity_comparison(self, df: pd.DataFrame, complexity_col: str, filename: str = 'module_complexity_comparison.png'): """Compare complexity across different modules.""" if 'module' not in df.columns or complexity_col not in df.columns: return mask = df[complexity_col].notna() if mask.sum() == 0: return fig, axes = plt.subplots(2, 1, figsize=(14, 10)) # Box plot modules = df.loc[mask, 'module'].unique() data_by_module = [df.loc[mask & (df['module'] == mod), complexity_col].values for mod in modules] axes[0].boxplot(data_by_module, labels=modules) axes[0].set_ylabel(complexity_col.replace('_', ' ').title(), fontsize=12) axes[0].set_title(f'{complexity_col.replace("_", " ").title()} by Module', fontsize=14, fontweight='bold') axes[0].tick_params(axis='x', rotation=45) axes[0].grid(True, alpha=0.3) # Bar plot of means module_means = df.loc[mask].groupby('module')[complexity_col].mean().sort_values(ascending=False) axes[1].bar(range(len(module_means)), module_means.values) axes[1].set_xticks(range(len(module_means))) axes[1].set_xticklabels(module_means.index, rotation=45, ha='right') axes[1].set_ylabel(f'Mean {complexity_col.replace("_", " ").title()}', fontsize=12) axes[1].set_title(f'Average {complexity_col.replace("_", " ").title()} by Module', fontsize=14, fontweight='bold') axes[1].grid(True, alpha=0.3, axis='y') plt.tight_layout() plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight') plt.close() def plot_distribution_analysis(self, df: pd.DataFrame, metric_col: str, filename: str = 'distribution_analysis.png'): """Plot distribution analysis for a metric.""" if metric_col not in df.columns: return data = df[metric_col].dropna() if len(data) == 0: return fig, axes = plt.subplots(2, 2, figsize=(14, 10)) # Histogram axes[0, 0].hist(data, bins=30, edgecolor='black', alpha=0.7) axes[0, 0].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12) axes[0, 0].set_ylabel('Frequency', fontsize=12) axes[0, 0].set_title('Histogram', fontsize=12, fontweight='bold') axes[0, 0].grid(True, alpha=0.3) # Q-Q plot from scipy import stats stats.probplot(data, dist="norm", plot=axes[0, 1]) axes[0, 1].set_title('Q-Q Plot (Normal Distribution)', fontsize=12, fontweight='bold') axes[0, 1].grid(True, alpha=0.3) # Box plot axes[1, 0].boxplot(data, vert=True) axes[1, 0].set_ylabel(metric_col.replace('_', ' ').title(), fontsize=12) axes[1, 0].set_title('Box Plot', fontsize=12, fontweight='bold') axes[1, 0].grid(True, alpha=0.3) # Cumulative distribution sorted_data = np.sort(data) cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data) axes[1, 1].plot(sorted_data, cumulative, linewidth=2) axes[1, 1].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12) axes[1, 1].set_ylabel('Cumulative Probability', fontsize=12) axes[1, 1].set_title('Cumulative Distribution Function', fontsize=12, fontweight='bold') axes[1, 1].grid(True, alpha=0.3) plt.suptitle(f'Distribution Analysis: {metric_col.replace("_", " ").title()}', fontsize=16, fontweight='bold') plt.tight_layout() plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight') plt.close() def plot_regression_results(self, regression_results: Dict, filename: str = 'regression_results.png'): """Visualize regression analysis results.""" if 'coefficients' not in regression_results: return coefficients = regression_results['coefficients'] p_values = regression_results.get('p_values', {}) ci = regression_results.get('confidence_intervals', {}) if not coefficients: return fig, axes = plt.subplots(1, 2, figsize=(14, 6)) # Coefficient plot with confidence intervals features = list(coefficients.keys()) coef_values = list(coefficients.values()) colors = ['red' if p_values.get(f, 1) < 0.05 else 'gray' for f in features] y_pos = np.arange(len(features)) axes[0].barh(y_pos, coef_values, color=colors, alpha=0.7) # Add confidence intervals for i, feature in enumerate(features): if feature in ci: ci_lower, ci_upper = ci[feature] axes[0].plot([ci_lower, ci_upper], [i, i], 'k-', linewidth=2) axes[0].set_yticks(y_pos) axes[0].set_yticklabels(features) axes[0].set_xlabel('Coefficient Value', fontsize=12) axes[0].set_title('Regression Coefficients with 95% CI', fontsize=14, fontweight='bold') axes[0].axvline(x=0, color='black', linestyle='--', linewidth=1) axes[0].grid(True, alpha=0.3) # P-values plot p_vals = [p_values.get(f, 1) for f in features] axes[1].barh(y_pos, p_vals, color=colors, alpha=0.7) axes[1].axvline(x=0.05, color='red', linestyle='--', linewidth=2, label='Significance Level (0.05)') axes[1].set_yticks(y_pos) axes[1].set_yticklabels(features) axes[1].set_xlabel('P-value', fontsize=12) axes[1].set_title('P-values for Coefficients', fontsize=14, fontweight='bold') axes[1].set_xlim([0, max(p_vals) * 1.1]) axes[1].legend() axes[1].grid(True, alpha=0.3) plt.suptitle(f'Regression Analysis Results (R² = {regression_results.get("r_squared", 0):.3f})', fontsize=16, fontweight='bold') plt.tight_layout() plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight') plt.close()