feat: Initial commit (HEAD, main)

author: Fuwn <[email protected]> 2025-12-09 23:16:23 -0800
committer: Fuwn <[email protected]> 2025-12-09 23:16:23 -0800
commit: 3ffcdb247df3f56c4c21c6fed83ee1af5fb94224 (patch)
tree: 409fe42bb385ca73bd1b152623465ee098434179 /config.py
download: mathematicalstatisticsproject-main.tar.xz mathematicalstatisticsproject-main.zip
1 files changed, 64 insertions, 0 deletions
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..40fbedf
--- /dev/null
+++ b/config.py
@@ -0,0 +1,64 @@
+"""
+Configuration file for the code metrics analysis project.
+"""
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# GitHub API Configuration
+GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', '')  # Set your token in .env file
+GITHUB_API_BASE = 'https://api.github.com'
+
+# Analysis Configuration
+MAX_REPOSITORIES = 10  # Limit number of repos to analyze
+MIN_STARS = 100  # Minimum stars for repository selection
+PYTHON_FILE_EXTENSIONS = ['.py']
+EXCLUDE_DIRS = ['__pycache__', '.git', 'venv', 'env', '.venv', 'node_modules', 'tests', 'test']
+
+# Curated list of popular Python projects that use semantic commits
+# Format: (owner, repo_name)
+CURATED_REPOSITORIES = [
+    # FastAPI - modern web framework
+    ('tiangolo', 'fastapi'),
+    # Requests - HTTP library
+    ('psf', 'requests'),
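+    # Django REST Framework - API framework (NOTE(review): GitHub slug is 'django-rest-framework'; 'djangorestframework' is the PyPI name - verify)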
+    ('encode', 'djangorestframework'),
+    # Flask - web framework
+    ('pallets', 'flask'),
+    # Celery - distributed task queue
+    ('celery', 'celery'),
+    # Pydantic - data validation
+    ('pydantic', 'pydantic'),
+    # SQLAlchemy - SQL toolkit
+    ('sqlalchemy', 'sqlalchemy'),
+    # Pandas - data analysis
+    ('pandas-dev', 'pandas'),
+    # NumPy - numerical computing
+    ('numpy', 'numpy'),
+    # Scikit-learn - machine learning
+    ('scikit-learn', 'scikit-learn'),
+]
+
+# Statistical Analysis Configuration
+SIGNIFICANCE_LEVEL = 0.05
+CONFIDENCE_LEVEL = 0.95
+
+# Output Configuration
+OUTPUT_DIR = 'results'
+FIGURES_DIR = 'figures'
+
+# Data Loading Configuration
+USE_EXISTING_METRICS = False  # If True, load from existing raw_metrics.csv instead of collecting new data
+# Keep this False to recollect the data using the corrected cognitive_complexity calculation
+RAW_METRICS_FILE = 'results/raw_metrics.csv'  # Path to existing raw metrics CSV file
+
+# Analysis Mode Configuration
+FOCUSED_MODE = True  # If True, only perform regression analysis and hypothesis testing (t-tests, z-tests)
+
+# Parallelization Configuration
+MAX_WORKERS = None  # None = use CPU count, or set to specific number
+PARALLEL_REPOS = True  # Process repositories in parallel
+PARALLEL_FILES = True  # Analyze files in parallel
+
author	Fuwn <[email protected]>	2025-12-09 23:16:23 -0800
committer	Fuwn <[email protected]>	2025-12-09 23:16:23 -0800
commit	3ffcdb247df3f56c4c21c6fed83ee1af5fb94224 (patch)
tree	409fe42bb385ca73bd1b152623465ee098434179 /config.py
download	mathematicalstatisticsproject-main.tar.xz mathematicalstatisticsproject-main.zip