diff options
| author | Fuwn <[email protected]> | 2025-12-09 23:16:23 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2025-12-09 23:16:23 -0800 |
| commit | 3ffcdb247df3f56c4c21c6fed83ee1af5fb94224 (patch) | |
| tree | 409fe42bb385ca73bd1b152623465ee098434179 /config.py | |
| download | mathematicalstatisticsproject-main.tar.xz mathematicalstatisticsproject-main.zip | |
Diffstat (limited to 'config.py')
| -rw-r--r-- | config.py | 64 |
1 files changed, 64 insertions, 0 deletions
"""
Configuration file for the code metrics analysis project.

All settings are plain module-level constants. Importing this module calls
``load_dotenv()`` before ``GITHUB_TOKEN`` is read from the environment.
"""
import os

from dotenv import load_dotenv

load_dotenv()

# GitHub API Configuration
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', '')  # Set your token in .env file
GITHUB_API_BASE = 'https://api.github.com'

# Analysis Configuration
MAX_REPOSITORIES = 10  # Limit number of repos to analyze
MIN_STARS = 100  # Minimum stars for repository selection
PYTHON_FILE_EXTENSIONS = ['.py']
EXCLUDE_DIRS = [
    '__pycache__',
    '.git',
    'venv',
    'env',
    '.venv',
    'node_modules',
    'tests',
    'test',
]

# Curated list of popular Python projects that use semantic commits
# Format: (owner, repo_name)
CURATED_REPOSITORIES = [
    # FastAPI - modern web framework
    ('tiangolo', 'fastapi'),
    # Requests - HTTP library
    ('psf', 'requests'),
    # Django REST Framework - API framework
    # The GitHub repository slug is 'django-rest-framework';
    # 'djangorestframework' is only the PyPI package name.
    ('encode', 'django-rest-framework'),
    # Flask - web framework
    ('pallets', 'flask'),
    # Celery - distributed task queue
    ('celery', 'celery'),
    # Pydantic - data validation
    ('pydantic', 'pydantic'),
    # SQLAlchemy - SQL toolkit
    ('sqlalchemy', 'sqlalchemy'),
    # Pandas - data analysis
    ('pandas-dev', 'pandas'),
    # NumPy - numerical computing
    ('numpy', 'numpy'),
    # Scikit-learn - machine learning
    ('scikit-learn', 'scikit-learn'),
]

# Statistical Analysis Configuration
SIGNIFICANCE_LEVEL = 0.05
CONFIDENCE_LEVEL = 0.95

# Output Configuration
OUTPUT_DIR = 'results'
FIGURES_DIR = 'figures'

# Data Loading Configuration
# If True, load from existing raw_metrics.csv instead of collecting new data.
# Set to False to recollect data with fixed cognitive_complexity calculation.
USE_EXISTING_METRICS = False
# Path to existing raw metrics CSV file. Built from OUTPUT_DIR so the two
# settings cannot drift apart; the value is still 'results/raw_metrics.csv'.
RAW_METRICS_FILE = OUTPUT_DIR + '/raw_metrics.csv'

# Analysis Mode Configuration
# If True, only perform regression analysis and hypothesis testing
# (t-tests, z-tests).
FOCUSED_MODE = True

# Parallelization Configuration
MAX_WORKERS = None  # None = use CPU count, or set to specific number
PARALLEL_REPOS = True  # Process repositories in parallel
PARALLEL_FILES = True  # Analyze files in parallel