# config.py

"""
Configuration file for the code metrics analysis project.
"""
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in this module can see them.
load_dotenv()

# GitHub API settings
# Token is read from the environment (set it in your .env file); an empty
# string is used when the variable is not defined.
GITHUB_TOKEN: str = os.environ.get('GITHUB_TOKEN', '')
# Base URL of the GitHub API.
GITHUB_API_BASE: str = 'https://api.github.com'

# Analysis settings
# Upper bound on the number of repositories to analyze.
MAX_REPOSITORIES: int = 10
# Minimum star count for repository selection.
MIN_STARS: int = 100
# File extensions treated as Python source.
PYTHON_FILE_EXTENSIONS = [
    '.py',
]
# Directory names to exclude.
EXCLUDE_DIRS = [
    '__pycache__',
    '.git',
    'venv',
    'env',
    '.venv',
    'node_modules',
    'tests',
    'test',
]

# Curated list of popular Python projects that use semantic commits.
# Format: (owner, repo_name). repo_name must be the repository name as it
# appears on GitHub, which is not always the same as the PyPI package name.
CURATED_REPOSITORIES = [
    # FastAPI - modern web framework
    ('tiangolo', 'fastapi'),
    # Requests - HTTP library
    ('psf', 'requests'),
    # Django REST Framework - API framework
    # (the GitHub repository is "django-rest-framework"; "djangorestframework"
    # is the PyPI distribution name and does not name a repository)
    ('encode', 'django-rest-framework'),
    # Flask - web framework
    ('pallets', 'flask'),
    # Celery - distributed task queue
    ('celery', 'celery'),
    # Pydantic - data validation
    ('pydantic', 'pydantic'),
    # SQLAlchemy - SQL toolkit
    ('sqlalchemy', 'sqlalchemy'),
    # Pandas - data analysis
    ('pandas-dev', 'pandas'),
    # NumPy - numerical computing
    ('numpy', 'numpy'),
    # Scikit-learn - machine learning
    ('scikit-learn', 'scikit-learn'),
]

# Statistical analysis settings: significance level and confidence level.
SIGNIFICANCE_LEVEL: float = 0.05
CONFIDENCE_LEVEL: float = 0.95

# Output settings: directory names for results and figures.
OUTPUT_DIR: str = 'results'
FIGURES_DIR: str = 'figures'

# Data loading settings
# When True, metrics are loaded from the existing raw_metrics.csv instead of
# collecting new data. Keep this False to recollect data with the fixed
# cognitive_complexity calculation.
USE_EXISTING_METRICS: bool = False
# Path to the existing raw metrics CSV file.
RAW_METRICS_FILE: str = 'results/raw_metrics.csv'

# Analysis mode settings
# When True, only regression analysis and hypothesis testing
# (t-tests, z-tests) are performed.
FOCUSED_MODE: bool = True

# Parallelization settings
# Worker count: None means "use the CPU count"; otherwise set a specific number.
MAX_WORKERS = None
# Process repositories in parallel when True.
PARALLEL_REPOS: bool = True
# Analyze files in parallel when True.
PARALLEL_FILES: bool = True