"""
Configuration file for the code metrics analysis project.

All settings are module-level constants; import this module and read the
names directly (e.g. ``config.MAX_REPOSITORIES``).
"""
import os

# python-dotenv is an optional convenience: if it is installed, variables
# from a local .env file are loaded into the environment; otherwise we fall
# back to whatever is already set in the process environment. Previously a
# missing python-dotenv package crashed the whole module at import time.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

# GitHub API Configuration
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', '')  # Set your token in .env file
GITHUB_API_BASE = 'https://api.github.com'

# Analysis Configuration
MAX_REPOSITORIES = 10  # Limit number of repos to analyze
MIN_STARS = 100  # Minimum stars for repository selection
PYTHON_FILE_EXTENSIONS = ['.py']
EXCLUDE_DIRS = ['__pycache__', '.git', 'venv', 'env', '.venv', 'node_modules', 'tests', 'test']

# Curated list of popular Python projects that use semantic commits.
# Format: (owner, repo_name)
CURATED_REPOSITORIES = [
    ('tiangolo', 'fastapi'),            # FastAPI - modern web framework
    ('psf', 'requests'),                # Requests - HTTP library
    ('encode', 'djangorestframework'),  # Django REST Framework - API framework
    ('pallets', 'flask'),               # Flask - web framework
    ('celery', 'celery'),               # Celery - distributed task queue
    ('pydantic', 'pydantic'),           # Pydantic - data validation
    ('sqlalchemy', 'sqlalchemy'),       # SQLAlchemy - SQL toolkit
    ('pandas-dev', 'pandas'),           # Pandas - data analysis
    ('numpy', 'numpy'),                 # NumPy - numerical computing
    ('scikit-learn', 'scikit-learn'),   # Scikit-learn - machine learning
]

# Statistical Analysis Configuration
SIGNIFICANCE_LEVEL = 0.05
CONFIDENCE_LEVEL = 0.95

# Output Configuration
OUTPUT_DIR = 'results'
FIGURES_DIR = 'figures'

# Data Loading Configuration
USE_EXISTING_METRICS = False  # If True, load from existing raw_metrics.csv instead of collecting new data
# Set to False to recollect data with fixed cognitive_complexity calculation
RAW_METRICS_FILE = 'results/raw_metrics.csv'  # Path to existing raw metrics CSV file

# Analysis Mode Configuration
FOCUSED_MODE = True  # If True, only perform regression analysis and hypothesis testing (t-tests, z-tests)

# Parallelization Configuration
MAX_WORKERS = None  # None = use CPU count, or set to specific number
PARALLEL_REPOS = True  # Process repositories in parallel
PARALLEL_FILES = True  # Analyze files in parallel