diff options
| author | Fuwn <[email protected]> | 2025-12-09 23:16:23 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2025-12-09 23:16:23 -0800 |
| commit | 3ffcdb247df3f56c4c21c6fed83ee1af5fb94224 (patch) | |
| tree | 409fe42bb385ca73bd1b152623465ee098434179 /visualizer.py | |
| download | mathematicalstatisticsproject-main.tar.xz mathematicalstatisticsproject-main.zip | |
Diffstat (limited to 'visualizer.py')
| -rw-r--r-- | visualizer.py | 255 |
1 file changed, 255 insertions, 0 deletions
"""
Visualization module for code metrics analysis results.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional
import os


class Visualizer:
    """Creates visualizations for code metrics analysis.

    Every plotting method renders one figure, saves it as a PNG under
    ``output_dir``, and closes the figure so no matplotlib state leaks
    between calls. Methods return early (producing no file) when the
    requested columns are missing or there is too little data to plot.
    """

    def __init__(self, output_dir: str = 'figures'):
        """Create the output directory (if needed) and set global plot style.

        Args:
            output_dir: Directory where PNG figures are written.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        sns.set_style("whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)

    def plot_correlation_heatmap(self, df: pd.DataFrame,
                                 complexity_cols: List[str],
                                 issue_cols: List[str],
                                 filename: str = 'correlation_heatmap.png'):
        """Create correlation heatmap between complexity and issues.

        Pearson correlations are computed pairwise over rows where both
        columns are non-null; a pair needs at least 3 such rows, otherwise
        its cell stays NaN and is masked out of the heatmap.

        Args:
            df: Data with one row per observation.
            complexity_cols: Candidate complexity metric column names.
            issue_cols: Candidate issue metric column names.
            filename: Output PNG name inside ``output_dir``.
        """
        # Filter to columns actually present in the frame.
        comp_cols = [col for col in complexity_cols if col in df.columns]
        iss_cols = [col for col in issue_cols if col in df.columns]

        if not comp_cols or not iss_cols:
            return

        # Calculate correlation matrix (NaN marks "not enough data").
        corr_matrix = np.full((len(comp_cols), len(iss_cols)), np.nan)

        for i, comp_col in enumerate(comp_cols):
            for j, iss_col in enumerate(iss_cols):
                # Pairwise-complete observations only.
                mask = df[[comp_col, iss_col]].notna().all(axis=1)
                if mask.sum() >= 3:
                    corr = df.loc[mask, comp_col].corr(df.loc[mask, iss_col], method='pearson')
                    if not np.isnan(corr):
                        corr_matrix[i, j] = corr

        # Check if we have any valid correlations at all.
        if np.isnan(corr_matrix).all():
            return

        # Create heatmap; NaN cells are masked so they render blank.
        fig, ax = plt.subplots(figsize=(14, 10))
        sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm',
                    center=0, vmin=-1, vmax=1, ax=ax,
                    xticklabels=iss_cols, yticklabels=comp_cols,
                    mask=np.isnan(corr_matrix))
        ax.set_title('Correlation Heatmap: Complexity Metrics vs Issue Metrics',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()

    def plot_complexity_vs_issues_scatter(self, df: pd.DataFrame,
                                          complexity_col: str,
                                          issue_col: str,
                                          filename: Optional[str] = None):
        """Create scatter plot of complexity vs issues.

        A least-squares trend line is overlaid when the x data has variance
        and the fit succeeds.

        Args:
            df: Data with one row per observation.
            complexity_col: Column plotted on the x axis.
            issue_col: Column plotted on the y axis.
            filename: Output PNG name; when None the figure is built but
                not saved (it is still closed).
        """
        if complexity_col not in df.columns or issue_col not in df.columns:
            return

        mask = df[[complexity_col, issue_col]].notna().all(axis=1)
        if mask.sum() < 3:
            return

        x = df.loc[mask, complexity_col]
        y = df.loc[mask, issue_col]

        # Remove any infinite or NaN values.
        valid_mask = np.isfinite(x) & np.isfinite(y)
        x = x[valid_mask]
        y = y[valid_mask]

        # FIX: validate the data BEFORE creating the figure — the original
        # allocated the figure first and leaked it on this early return.
        if len(x) < 2:
            return

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(x, y, alpha=0.5, s=50)

        # Add regression line (with error handling); remember whether a
        # labelled artist was actually drawn.
        has_trend_line = False
        try:
            # Check if x has variance (not constant) — polyfit on a
            # constant x is ill-conditioned.
            if x.std() > 1e-10:
                z = np.polyfit(x, y, 1)
                p = np.poly1d(z)
                x_sorted = np.sort(x)
                ax.plot(x_sorted, p(x_sorted), "r--", alpha=0.8, linewidth=2,
                        label=f'Trend line (slope={z[0]:.3f})')
                has_trend_line = True
        except (np.linalg.LinAlgError, ValueError, RuntimeError):
            # Skip regression line if fitting fails.
            pass

        ax.set_xlabel(complexity_col.replace('_', ' ').title(), fontsize=12)
        ax.set_ylabel(issue_col.replace('_', ' ').title(), fontsize=12)
        ax.set_title(f'{complexity_col.replace("_", " ").title()} vs '
                     f'{issue_col.replace("_", " ").title()}',
                     fontsize=14, fontweight='bold')
        # FIX: only draw the legend when the trend line exists; an
        # unconditional legend() warns "no artists with labels found".
        if has_trend_line:
            ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        if filename:
            plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()

    def plot_module_complexity_comparison(self, df: pd.DataFrame,
                                          complexity_col: str,
                                          filename: str = 'module_complexity_comparison.png'):
        """Compare complexity across different modules.

        Produces a two-panel figure: a per-module box plot (top) and a bar
        chart of per-module means sorted descending (bottom).

        Args:
            df: Data containing a 'module' column plus the metric column.
            complexity_col: Metric column to compare across modules.
            filename: Output PNG name inside ``output_dir``.
        """
        if 'module' not in df.columns or complexity_col not in df.columns:
            return

        mask = df[complexity_col].notna()
        if mask.sum() == 0:
            return

        fig, axes = plt.subplots(2, 1, figsize=(14, 10))

        # Box plot: one box per module, built only from non-null rows.
        modules = df.loc[mask, 'module'].unique()
        data_by_module = [df.loc[mask & (df['module'] == mod), complexity_col].values
                          for mod in modules]

        axes[0].boxplot(data_by_module, labels=modules)
        axes[0].set_ylabel(complexity_col.replace('_', ' ').title(), fontsize=12)
        axes[0].set_title(f'{complexity_col.replace("_", " ").title()} by Module',
                          fontsize=14, fontweight='bold')
        axes[0].tick_params(axis='x', rotation=45)
        axes[0].grid(True, alpha=0.3)

        # Bar plot of means, largest first.
        module_means = df.loc[mask].groupby('module')[complexity_col].mean().sort_values(ascending=False)
        axes[1].bar(range(len(module_means)), module_means.values)
        axes[1].set_xticks(range(len(module_means)))
        axes[1].set_xticklabels(module_means.index, rotation=45, ha='right')
        axes[1].set_ylabel(f'Mean {complexity_col.replace("_", " ").title()}', fontsize=12)
        axes[1].set_title(f'Average {complexity_col.replace("_", " ").title()} by Module',
                          fontsize=14, fontweight='bold')
        axes[1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()

    def plot_distribution_analysis(self, df: pd.DataFrame,
                                   metric_col: str,
                                   filename: str = 'distribution_analysis.png'):
        """Plot distribution analysis for a metric.

        Four panels: histogram, normal Q-Q plot, box plot, and empirical
        cumulative distribution function.

        Args:
            df: Data containing the metric column.
            metric_col: Column whose distribution is analysed.
            filename: Output PNG name inside ``output_dir``.
        """
        if metric_col not in df.columns:
            return

        data = df[metric_col].dropna()
        if len(data) == 0:
            return

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        # Histogram
        axes[0, 0].hist(data, bins=30, edgecolor='black', alpha=0.7)
        axes[0, 0].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[0, 0].set_ylabel('Frequency', fontsize=12)
        axes[0, 0].set_title('Histogram', fontsize=12, fontweight='bold')
        axes[0, 0].grid(True, alpha=0.3)

        # Q-Q plot against the normal distribution.
        from scipy import stats
        stats.probplot(data, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title('Q-Q Plot (Normal Distribution)', fontsize=12, fontweight='bold')
        axes[0, 1].grid(True, alpha=0.3)

        # Box plot
        axes[1, 0].boxplot(data, vert=True)
        axes[1, 0].set_ylabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[1, 0].set_title('Box Plot', fontsize=12, fontweight='bold')
        axes[1, 0].grid(True, alpha=0.3)

        # Empirical cumulative distribution.
        sorted_data = np.sort(data)
        cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
        axes[1, 1].plot(sorted_data, cumulative, linewidth=2)
        axes[1, 1].set_xlabel(metric_col.replace('_', ' ').title(), fontsize=12)
        axes[1, 1].set_ylabel('Cumulative Probability', fontsize=12)
        axes[1, 1].set_title('Cumulative Distribution Function', fontsize=12, fontweight='bold')
        axes[1, 1].grid(True, alpha=0.3)

        plt.suptitle(f'Distribution Analysis: {metric_col.replace("_", " ").title()}',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()

    def plot_regression_results(self, regression_results: Dict,
                                filename: str = 'regression_results.png'):
        """Visualize regression analysis results.

        Left panel: coefficient magnitudes (red bars for p < 0.05) with
        confidence-interval whiskers when provided. Right panel: p-values
        with the 0.05 significance threshold marked.

        Args:
            regression_results: Mapping with 'coefficients' (required),
                and optionally 'p_values', 'confidence_intervals'
                (feature -> (lower, upper)) and 'r_squared'.
            filename: Output PNG name inside ``output_dir``.
        """
        if 'coefficients' not in regression_results:
            return

        coefficients = regression_results['coefficients']
        p_values = regression_results.get('p_values', {})
        ci = regression_results.get('confidence_intervals', {})

        if not coefficients:
            return

        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Coefficient plot with confidence intervals; significant
        # coefficients (p < 0.05) are highlighted in red.
        features = list(coefficients.keys())
        coef_values = list(coefficients.values())
        colors = ['red' if p_values.get(f, 1) < 0.05 else 'gray'
                  for f in features]

        y_pos = np.arange(len(features))
        axes[0].barh(y_pos, coef_values, color=colors, alpha=0.7)

        # Add confidence intervals as horizontal whiskers.
        for i, feature in enumerate(features):
            if feature in ci:
                ci_lower, ci_upper = ci[feature]
                axes[0].plot([ci_lower, ci_upper], [i, i], 'k-', linewidth=2)

        axes[0].set_yticks(y_pos)
        axes[0].set_yticklabels(features)
        axes[0].set_xlabel('Coefficient Value', fontsize=12)
        axes[0].set_title('Regression Coefficients with 95% CI', fontsize=14, fontweight='bold')
        axes[0].axvline(x=0, color='black', linestyle='--', linewidth=1)
        axes[0].grid(True, alpha=0.3)

        # P-values plot
        p_vals = [p_values.get(f, 1) for f in features]
        axes[1].barh(y_pos, p_vals, color=colors, alpha=0.7)
        axes[1].axvline(x=0.05, color='red', linestyle='--', linewidth=2,
                        label='Significance Level (0.05)')
        axes[1].set_yticks(y_pos)
        axes[1].set_yticklabels(features)
        axes[1].set_xlabel('P-value', fontsize=12)
        # FIX: keep the upper x limit at least past 0.05 so the axis never
        # collapses to [0, 0] (all p-values ~0) and the significance line
        # stays visible.
        axes[1].set_xlim([0, max(max(p_vals) * 1.1, 0.06)])
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        plt.suptitle(f'Regression Analysis Results (R² = {regression_results.get("r_squared", 0):.3f})',
                     fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(self.output_dir / filename, dpi=300, bbox_inches='tight')
        plt.close()