#!/usr/bin/env python3
"""
clean_data.py — Data cleaning utility for the data-insights skill.

Usage:
    python clean_data.py input.csv output.csv [--report]

This script:
1. Reads a CSV file
2. Detects and fixes common data quality issues
3. Outputs a cleaned CSV
4. Optionally prints a cleaning report

Cleaning steps:
- Remove completely empty rows
- Strip whitespace from all string values
- Normalize date formats (attempts to parse various formats)
- Fill missing numeric values with column median
- Remove duplicate rows
- Standardize column names (lowercase, underscores)
"""

import csv
import re
import statistics
import sys
from collections import defaultdict
from datetime import datetime


def standardize_column_name(name):
    """Convert a column name to lowercase_with_underscores.

    Characters that are neither word characters nor whitespace are removed,
    the result is trimmed, each run of whitespace becomes one underscore,
    and the name is lowercased.

    Args:
        name: The raw column header text.

    Returns:
        The standardized column name.
    """
    # Trim AFTER removing punctuation: whitespace that sat next to removed
    # punctuation (e.g. "Price ($)" -> "Price ") would otherwise turn into a
    # stray leading/trailing underscore. Underscores already present in the
    # header are word characters and are left untouched.
    name = re.sub(r'[^\w\s]', '', name).strip()
    name = re.sub(r'\s+', '_', name)
    return name.lower()


def is_numeric(value):
    """Report whether *value* is a string that ``float()`` can parse.

    Commas are removed before parsing, so thousands-separated text such as
    ``"1,234.5"`` counts as numeric. Objects without a ``replace`` method
    (for example ``None``) are reported as not numeric.
    """
    try:
        without_commas = value.replace(',', '')
        float(without_commas)
    except (AttributeError, ValueError):
        outcome = False
    else:
        outcome = True
    return outcome


def parse_date(value):
    """Normalize a date string to ``YYYY-MM-DD`` if a known format matches.

    The candidate formats are tried in a fixed order and the first one that
    parses wins, so an ambiguous value such as ``03/04/2024`` is read as
    month/day/year. Surrounding whitespace is ignored while parsing. When no
    format matches, the original value is returned unchanged.
    """
    iso_layout = '%Y-%m-%d'
    candidate = value.strip()
    for layout in (
        '%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y',
        '%b %Y', '%B %Y', '%Y-%m',
        '%m-%d-%Y', '%d-%m-%Y',
        '%b %d, %Y', '%B %d, %Y',
    ):
        try:
            return datetime.strptime(candidate, layout).strftime(iso_layout)
        except ValueError:
            # This layout does not fit; fall through to the next one.
            pass
    return value


def clean_csv(input_path, output_path, show_report=False):
    """Clean a CSV file and write the result to a new CSV file.

    Steps, in order: standardize the column names, strip whitespace from
    every value, drop rows whose values are all empty, normalize detected
    date columns to ``YYYY-MM-DD``, fill blank cells of numeric columns with
    the column median, and drop exact duplicate rows (first occurrence kept).

    A column is numeric only when every non-empty value in it is numeric.
    A column is treated as a date column when at least one of its first five
    non-empty values is changed by ``parse_date``.

    Args:
        input_path: Path of the CSV file to read (UTF-8, optional BOM).
        output_path: Path the cleaned CSV file is written to (UTF-8).
        show_report: When true, print a summary of the cleaning to stdout.

    Returns:
        A dict of counters with the keys ``rows_read``, ``rows_after_clean``,
        ``empty_rows_removed``, ``duplicates_removed``,
        ``missing_values_filled`` and ``dates_normalized``.
    """
    report = {
        'rows_read': 0,
        'rows_after_clean': 0,
        'empty_rows_removed': 0,
        'duplicates_removed': 0,
        'missing_values_filled': 0,
        'dates_normalized': 0,
    }

    # Read data. newline='' lets the csv module handle line endings itself,
    # which keeps newlines embedded in quoted fields intact.
    with open(input_path, 'r', newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
        # fieldnames is None for a completely empty file; treat as no columns.
        original_headers = list(reader.fieldnames or [])
        rows = list(reader)
    report['rows_read'] = len(rows)

    # Standardize column names
    header_map = {h: standardize_column_name(h) for h in original_headers}
    clean_headers = [header_map[h] for h in original_headers]

    # Remap rows to new headers. Iterating the known headers (rather than the
    # row's own keys) ignores the extra None key DictReader adds for rows with
    # too many fields; short rows carry None values, which become ''.
    clean_rows = [
        {header_map[h]: (row.get(h) or '').strip() for h in original_headers}
        for row in rows
    ]

    # Remove empty rows
    non_empty = [r for r in clean_rows if any(r.values())]
    report['empty_rows_removed'] = len(clean_rows) - len(non_empty)
    clean_rows = non_empty

    # Detect column types
    numeric_cols = []
    date_cols = []
    for col in clean_headers:
        values = [r[col] for r in clean_rows if r[col]]
        if not values:
            continue
        # Check every value, not just a sample: the median step below converts
        # the whole column with float() and would fail on a stray text value.
        if all(is_numeric(v) for v in values):
            numeric_cols.append(col)
        elif any(parse_date(v) != v for v in values[:5]):
            date_cols.append(col)

    # Normalize dates
    for col in date_cols:
        for row in clean_rows:
            if row[col]:
                normalized = parse_date(row[col])
                if normalized != row[col]:
                    report['dates_normalized'] += 1
                    row[col] = normalized

    # Fill missing numeric values with column median
    for col in numeric_cols:
        present = [float(r[col].replace(',', '')) for r in clean_rows if r[col]]
        if not present:
            continue
        # True median: for an even count this is the mean of the two middle
        # values, not the upper-middle element.
        fill_value = str(statistics.median(present))
        for row in clean_rows:
            if not row[col]:
                row[col] = fill_value
                report['missing_values_filled'] += 1

    # Remove duplicates, keeping the first occurrence of each row
    seen = set()
    unique_rows = []
    for row in clean_rows:
        key = tuple(row.values())
        if key not in seen:
            seen.add(key)
            unique_rows.append(row)
    report['duplicates_removed'] = len(clean_rows) - len(unique_rows)
    clean_rows = unique_rows

    report['rows_after_clean'] = len(clean_rows)

    # Write output
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=clean_headers)
        writer.writeheader()
        writer.writerows(clean_rows)

    if show_report:
        print("=== Data Cleaning Report ===")
        print(f"  Rows read:              {report['rows_read']}")
        print(f"  Empty rows removed:     {report['empty_rows_removed']}")
        print(f"  Duplicates removed:     {report['duplicates_removed']}")
        print(f"  Missing values filled:  {report['missing_values_filled']}")
        print(f"  Dates normalized:       {report['dates_normalized']}")
        print(f"  Rows after cleaning:    {report['rows_after_clean']}")
        print(f"  Output: {output_path}")

    return report


if __name__ == '__main__':
    # Command-line entry point: two positional paths are required; the
    # optional --report flag may appear anywhere on the command line.
    argv = sys.argv
    if len(argv) >= 3:
        clean_csv(argv[1], argv[2], show_report='--report' in argv)
    else:
        print("Usage: python clean_data.py input.csv output.csv [--report]")
        sys.exit(1)
