aboutsummaryrefslogtreecommitdiff
path: root/config.py
diff options
context:
space:
mode:
author Fuwn <[email protected]> 2025-12-09 23:16:23 -0800
committer Fuwn <[email protected]> 2025-12-09 23:16:23 -0800
commit 3ffcdb247df3f56c4c21c6fed83ee1af5fb94224 (patch)
tree 409fe42bb385ca73bd1b152623465ee098434179 /config.py
download mathematicalstatisticsproject-3ffcdb247df3f56c4c21c6fed83ee1af5fb94224.tar.xz
mathematicalstatisticsproject-3ffcdb247df3f56c4c21c6fed83ee1af5fb94224.zip
feat: Initial commit (HEAD, main)
Diffstat (limited to 'config.py')
-rw-r--r-- config.py 64
1 files changed, 64 insertions, 0 deletions
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..40fbedf
--- /dev/null
+++ b/config.py
@@ -0,0 +1,64 @@
+"""
+Configuration file for the code metrics analysis project.
+"""
+import os
+from dotenv import load_dotenv
+
+load_dotenv() # Runs at import time so values from a .env file are visible to the os.getenv call below
+
+# GitHub API Configuration
+GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', '') # Set your token in .env file; defaults to '' when GITHUB_TOKEN is unset
+GITHUB_API_BASE = 'https://api.github.com' # Base URL, no trailing slash
+
+# Analysis Configuration
+MAX_REPOSITORIES = 10 # Limit number of repos to analyze
+MIN_STARS = 100 # Minimum stars for repository selection
+PYTHON_FILE_EXTENSIONS = ['.py'] # File suffixes (leading dot included) treated as Python source
+EXCLUDE_DIRS = ['__pycache__', '.git', 'venv', 'env', '.venv', 'node_modules', 'tests', 'test'] # Directory names to skip (test directories included); NOTE(review): matching rule lives in the consumer - confirm
+
+# Curated list of popular Python projects that use semantic commits
+# Format: (owner, repo_name), i.e. the two path segments of github.com/<owner>/<repo_name>
+CURATED_REPOSITORIES = [
+ # FastAPI - modern web framework
+ ('tiangolo', 'fastapi'),
+ # Requests - HTTP library
+ ('psf', 'requests'),
+ # Django REST Framework - API framework (GitHub repo is 'django-rest-framework'; 'djangorestframework' is only the PyPI name)
+ ('encode', 'django-rest-framework'),
+ # Flask - web framework
+ ('pallets', 'flask'),
+ # Celery - distributed task queue
+ ('celery', 'celery'),
+ # Pydantic - data validation
+ ('pydantic', 'pydantic'),
+ # SQLAlchemy - SQL toolkit
+ ('sqlalchemy', 'sqlalchemy'),
+ # Pandas - data analysis
+ ('pandas-dev', 'pandas'),
+ # NumPy - numerical computing
+ ('numpy', 'numpy'),
+ # Scikit-learn - machine learning
+ ('scikit-learn', 'scikit-learn'),
+]
+
+# Statistical Analysis Configuration
+SIGNIFICANCE_LEVEL = 0.05 # Alpha; NOTE(review): presumably the p-value threshold for the hypothesis tests - confirm in the analysis code
+CONFIDENCE_LEVEL = 0.95 # Currently equals 1 - SIGNIFICANCE_LEVEL; keep the two in sync when changing either
+
+# Output Configuration
+OUTPUT_DIR = 'results' # Relative path; RAW_METRICS_FILE below hard-codes the same directory name
+FIGURES_DIR = 'figures' # Relative path
+
+# Data Loading Configuration
+USE_EXISTING_METRICS = False # If True, load from existing raw_metrics.csv instead of collecting new data
+# Leave USE_EXISTING_METRICS as False to recollect data with the fixed cognitive_complexity calculation
+RAW_METRICS_FILE = 'results/raw_metrics.csv' # Path to existing raw metrics CSV file; directory part duplicates OUTPUT_DIR, so change both together
+
+# Analysis Mode Configuration
+FOCUSED_MODE = True # If True, only perform regression analysis and hypothesis testing (t-tests, z-tests)
+
+# Parallelization Configuration
+MAX_WORKERS = None # None = use CPU count, or set to specific number
+PARALLEL_REPOS = True # Process repositories in parallel
+PARALLEL_FILES = True # Analyze files in parallel
+