#!/usr/bin/env python3
"""SuaXe V2 — Phase 1 schema bootstrap & JSON crawl import.

Single-file CLI gom 4 lệnh phục vụ vòng đời demo Phase 1:

    init-schema  Chạy `phase1_schema.sql` (3 bảng + indexes + seed shop_types).
    import-json  Parse file JSON crawl Google Maps, INSERT shops + reviews.
                 Whitelist 10 loại `address_type`, dedupe theo origin_ref.
    reset        DROP 3 bảng → init-schema → import-json (full rebuild).
    count        Đếm row mỗi bảng (sanity check).

Cấu hình DB qua --db-url hoặc env DATABASE_URL.
Default: postgresql://postgres:postgres@localhost:5432/suaxe

Dependencies:
    pip install "psycopg[binary]>=3.1"

Examples:
    export DATABASE_URL=postgresql://postgres:postgres@localhost:5432/suaxe
    python phase1_import.py init-schema
    python phase1_import.py import-json --json ../sua_xe_in_quan_hoan_kiem_ha_noi_17619123390984886.json
    python phase1_import.py count
    python phase1_import.py reset --json ../sua_xe_..._.json --yes
"""

from __future__ import annotations

import argparse
import json
import os
import re
import secrets
import sys
import time
import unicodedata
import uuid
from pathlib import Path

import psycopg
from psycopg.types.json import Jsonb

# ----------------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------------

SCHEMA_FILE = Path(__file__).parent / "phase1_schema.sql"
DEFAULT_DB_URL = "postgresql://postgres:postgres@localhost:5432/suaxe"

# Whitelist `address_type` → shop_types.code. Khác → bỏ qua.
# Thêm 1 entry typo trong crawl: "sửa chữ" thiếu "a".
TYPE_MAP: dict[str, str] = {
    "Cửa hàng sửa chữa xe máy":          "motorcycle_repair",
    "Xưởng sửa chữa xe tay ga":          "scooter_repair",
    "Cửa hàng phụ tùng xe máy":          "motorcycle_parts",
    "Xưởng sửa chữa ô tô":               "car_repair",
    "Cửa hàng bảo trì và sửa chữa ô tô": "car_maintenance",
    "Cửa hàng bảo trì và sửa chữ ô tô":  "car_maintenance",  # crawl typo
    "Cửa hàng săm lốp":                  "tire_shop",
    "Dịch vụ độ xe ô tô":                "car_tuning",
    "Dịch vụ độ xe tay ga":              "scooter_tuning",
    "Chuyên độ xe":                      "modification",
    "Cửa hàng phụ tùng xe ô tô":         "motorcycle_parts_car",
}

# Lat/lng từ Google Maps URL: `!3d{lat}!4d{lng}`
LL_REGEX = re.compile(r"!3d([\d.\-]+)!4d([\d.\-]+)")

# Place CID làm dedupe key: `:0x{hex}!`
ORIGIN_REF_REGEX = re.compile(r":(0x[0-9a-fA-F]+)!")


# ----------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------

def uuid7() -> uuid.UUID:
    """RFC 9562 UUID v7 — time-ordered, sortable. App-side generation.

    Layout (MSB first): 48-bit ms | 4-bit ver=7 | 12-bit rand_a |
    2-bit variant=10 | 62-bit rand_b.
    """
    ms = int(time.time() * 1000) & 0xFFFFFFFFFFFF
    rand_a = secrets.randbits(12)
    rand_b = secrets.randbits(62)
    val = (
        (ms << 80)
        | (0x7 << 76)
        | (rand_a << 64)
        | (0b10 << 62)
        | rand_b
    )
    return uuid.UUID(int=val)


def normalize_name(s: str) -> str:
    """lowercase + bỏ dấu (NFD strip). Dùng cho FTS + LIKE search."""
    nfd = unicodedata.normalize("NFD", s)
    no_diacritics = "".join(ch for ch in nfd if unicodedata.category(ch) != "Mn")
    # 'đ' không bị decompose bởi NFD — handle thủ công.
    return no_diacritics.replace("đ", "d").replace("Đ", "D").lower().strip()


# Private Use Area — Google Maps crawl gắn icon ghosts (📍, ☎) ở -.
PUA_REGEX = re.compile(r"[-]")


def clean_text(s: str | None) -> str | None:
    if s is None:
        return None
    no_pua = PUA_REGEX.sub("", s)
    t = re.sub(r"\s+", " ", no_pua).strip()
    return t or None


def clean_phone(s: str | None) -> str | None:
    return clean_text(s)


def parse_lat_lng(url: str) -> tuple[float, float] | None:
    m = LL_REGEX.search(url or "")
    if not m:
        return None
    try:
        return float(m.group(1)), float(m.group(2))
    except ValueError:
        return None


def parse_origin_ref(url: str) -> str | None:
    m = ORIGIN_REF_REGEX.search(url or "")
    return m.group(1) if m else None


def parse_shop_record(raw: dict) -> dict | None:
    """Parse 1 entry JSON crawl → dict ready to INSERT, hoặc None nếu skip.

    Skip rules:
      - address_type không thuộc whitelist 10 loại
      - URL rỗng (không dedupe được)
      - không parse được origin_ref (URL malformed)
    """
    raw_type = (raw.get("address_type") or "").strip()
    code = TYPE_MAP.get(raw_type)
    if code is None:
        return None

    url = (raw.get("url") or "").strip()
    if not url:
        return None

    origin_ref = parse_origin_ref(url)
    if not origin_ref:
        return None

    name = clean_text(raw.get("name")) or ""
    if not name:
        return None

    address = clean_text(raw.get("address")) or ""

    ll = parse_lat_lng(url)
    location_wkt = f"POINT({ll[1]} {ll[0]})" if ll else None

    wh_raw = raw.get("time_working") or {}
    open_t = (wh_raw.get("open_time") or "").strip()
    close_t = (wh_raw.get("close_time") or "").strip()
    working_hours = {"open": open_t, "close": close_t} if open_t and close_t else None

    images_raw = [u for u in (raw.get("images") or []) if isinstance(u, str) and u]
    images = [
        {"url": u, "is_primary": i == 0, "order": i}
        for i, u in enumerate(images_raw)
    ] or None

    rate = raw.get("rate") or {}
    rating_avg = float(rate.get("rate_point") or 0)
    rating_count = int(rate.get("rate_count") or 0)

    reviews = []
    for r in raw.get("reviews") or []:
        if not isinstance(r, dict):
            continue
        username = clean_text(r.get("username"))
        rp = r.get("rate_point")
        if not username or not isinstance(rp, int) or not (1 <= rp <= 5):
            continue
        reviews.append({
            "username": username,
            "rate_point": rp,
            "content": clean_text(r.get("content")),
        })

    verification = (raw.get("verification") or "").strip() or "unverified"

    return {
        "name": name,
        "name_normalized": normalize_name(name),
        "shop_type_code": code,
        "address": address,
        "phone": clean_phone(raw.get("phone")),
        "website": clean_text(raw.get("website")),
        "google_maps_url": url,
        "location_wkt": location_wkt,
        "working_hours": working_hours,
        "images": images,
        "rating_avg": rating_avg,
        "rating_count": rating_count,
        "origin_ref": origin_ref,
        "verification": verification,
        "reviews": reviews,
    }


# ----------------------------------------------------------------------------
# DB ops
# ----------------------------------------------------------------------------

def load_shop_type_map(cur) -> dict[str, int]:
    cur.execute("SELECT code, id FROM shop_types WHERE is_active = true")
    return {row[0]: row[1] for row in cur.fetchall()}


def insert_shop(cur, shop: dict, type_id: int) -> uuid.UUID | None:
    """INSERT shop + reviews. Trả về shop UUID, None nếu duplicate."""
    shop_id = uuid7()
    cur.execute(
        """
        INSERT INTO shops (
            id, name, name_normalized, shop_type_id, address, phone, website,
            google_maps_url, location, working_hours, images,
            rating_avg, rating_count, verification, origin, origin_ref, status
        ) VALUES (
            %s, %s, %s, %s, %s, %s, %s,
            %s, ST_GeogFromText(%s), %s, %s,
            %s, %s, %s, %s, %s, %s
        )
        ON CONFLICT (origin_ref) WHERE origin_ref IS NOT NULL DO NOTHING
        RETURNING id
        """,
        (
            shop_id, shop["name"], shop["name_normalized"], type_id,
            shop["address"], shop["phone"], shop["website"],
            shop["google_maps_url"], shop["location_wkt"],
            Jsonb(shop["working_hours"]) if shop["working_hours"] else None,
            Jsonb(shop["images"]) if shop["images"] else None,
            shop["rating_avg"], shop["rating_count"],
            shop["verification"], "gmaps_crawl", shop["origin_ref"], "active",
        ),
    )
    row = cur.fetchone()
    if row is None:
        return None  # duplicate origin_ref

    inserted_id = row[0]
    for r in shop["reviews"]:
        cur.execute(
            """
            INSERT INTO shop_reviews (id, shop_id, username, rate_point, content, source)
            VALUES (%s, %s, %s, %s, %s, %s)
            """,
            (uuid7(), inserted_id, r["username"], r["rate_point"], r["content"], "gmaps_crawl"),
        )
    return inserted_id


# ----------------------------------------------------------------------------
# Commands
# ----------------------------------------------------------------------------

def cmd_init_schema(args) -> int:
    if not SCHEMA_FILE.exists():
        print(f"!! Không tìm thấy {SCHEMA_FILE}", file=sys.stderr)
        return 1
    ddl = SCHEMA_FILE.read_text(encoding="utf-8")
    with psycopg.connect(args.db_url, autocommit=True) as conn:
        with conn.cursor() as cur:
            cur.execute(ddl)
    print(f"✓ Schema applied từ {SCHEMA_FILE.name}")
    return 0


def cmd_import_json(args) -> int:
    path = Path(args.json)
    if not path.exists():
        print(f"!! Không tìm thấy file JSON: {path}", file=sys.stderr)
        return 1
    raw_list = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(raw_list, list):
        print("!! JSON crawl phải là array ở root", file=sys.stderr)
        return 1

    parsed = []
    skipped_whitelist = 0
    skipped_invalid = 0
    for raw in raw_list:
        if not isinstance(raw, dict):
            skipped_invalid += 1
            continue
        rec = parse_shop_record(raw)
        if rec is None:
            raw_type = (raw.get("address_type") or "").strip()
            if raw_type and raw_type not in TYPE_MAP:
                skipped_whitelist += 1
            else:
                skipped_invalid += 1
            continue
        parsed.append(rec)

    print(f"Parse: {len(raw_list)} entries → {len(parsed)} shop hợp lệ "
          f"(bỏ {skipped_whitelist} không thuộc whitelist, {skipped_invalid} invalid)")

    if args.dry_run:
        for p in parsed[:5]:
            print(f"  • {p['name']}  [{p['shop_type_code']}]  {p['origin_ref']}")
        if len(parsed) > 5:
            print(f"  … +{len(parsed) - 5} shop nữa")
        return 0

    inserted = 0
    duplicates = 0
    total_reviews = 0
    with psycopg.connect(args.db_url) as conn:
        with conn.cursor() as cur:
            type_map = load_shop_type_map(cur)
            for rec in parsed:
                tid = type_map.get(rec["shop_type_code"])
                if tid is None:
                    print(f"!! Bỏ qua: shop_types thiếu '{rec['shop_type_code']}' "
                          f"(chạy init-schema trước)", file=sys.stderr)
                    continue
                shop_id = insert_shop(cur, rec, tid)
                if shop_id is None:
                    duplicates += 1
                else:
                    inserted += 1
                    total_reviews += len(rec["reviews"])
        conn.commit()

    print(f"✓ Insert: {inserted} shop mới, {duplicates} duplicate (bỏ qua), "
          f"{total_reviews} review.")
    return 0


def cmd_reset(args) -> int:
    if not args.yes:
        ans = input("⚠  DROP shops, shop_reviews, shop_types — gõ 'yes' để xác nhận: ")
        if ans.strip().lower() != "yes":
            print("Cancelled.")
            return 1
    with psycopg.connect(args.db_url, autocommit=True) as conn:
        with conn.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS shop_reviews CASCADE")
            cur.execute("DROP TABLE IF EXISTS shops CASCADE")
            cur.execute("DROP TABLE IF EXISTS shop_types CASCADE")
    print("✓ Dropped 3 bảng.")
    rc = cmd_init_schema(args)
    if rc != 0:
        return rc
    return cmd_import_json(args)


def cmd_count(args) -> int:
    with psycopg.connect(args.db_url) as conn:
        with conn.cursor() as cur:
            counts = {}
            for tbl in ("shop_types", "shops", "shop_reviews"):
                cur.execute(f"SELECT COUNT(*) FROM {tbl}")
                counts[tbl] = cur.fetchone()[0]
    for k, v in counts.items():
        print(f"  {k:14s} {v:>6d}")
    return 0


# ----------------------------------------------------------------------------
# CLI
# ----------------------------------------------------------------------------

def _add_db_arg(p: argparse.ArgumentParser) -> None:
    p.add_argument(
        "--db-url",
        default=os.getenv("DATABASE_URL", DEFAULT_DB_URL),
        help="Postgres connection string (default: env DATABASE_URL hoặc localhost)",
    )


def main() -> int:
    parser = argparse.ArgumentParser(
        description="SuaXe V2 Phase 1 — schema bootstrap & JSON crawl import.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="cmd", required=True)

    p_init = sub.add_parser("init-schema", help="Chạy DDL phase1_schema.sql.")
    _add_db_arg(p_init)

    p_import = sub.add_parser("import-json", help="Parse JSON crawl → INSERT.")
    _add_db_arg(p_import)
    p_import.add_argument("--json", required=True, help="Path tới file JSON crawl.")
    p_import.add_argument("--dry-run", action="store_true",
                          help="Parse + preview, không INSERT.")

    p_reset = sub.add_parser("reset", help="DROP 3 bảng → init-schema → import-json.")
    _add_db_arg(p_reset)
    p_reset.add_argument("--json", required=True, help="Path tới file JSON crawl.")
    p_reset.add_argument("--dry-run", action="store_true")
    p_reset.add_argument("--yes", action="store_true", help="Skip confirmation.")

    p_count = sub.add_parser("count", help="Đếm row mỗi bảng.")
    _add_db_arg(p_count)

    args = parser.parse_args()
    handlers = {
        "init-schema": cmd_init_schema,
        "import-json": cmd_import_json,
        "reset":       cmd_reset,
        "count":       cmd_count,
    }
    return handlers[args.cmd](args)


if __name__ == "__main__":
    sys.exit(main())
