from __future__ import annotations

import re


# Whole candidates (after lower-casing and punctuation removal) that are form
# labels or field values rather than a person's name; compared by equality.
_INVALID_NAME_PHRASES = {
    "telefono",
    "teléfono",
    "fecha egreso",
    "tipo y n",
    "masculino",
    "femenino",
    "paciente",
    "usuario",
}
# Single words that disqualify a candidate when they appear among its
# non-joiner tokens.
_INVALID_NAME_TOKENS = {
    "telefono",
    "teléfono",
    "fecha",
    "egreso",
    "tipo",
    "paciente",
    "usuario",
    "masculino",
    "femenino",
    "sexo",
    "identificacion",
    "identificación",
}
# Words removed from the END of a candidate: they are the label of the next
# form field that the greedy name capture ran into.
_TRAILING_LABEL_TOKENS = {
    "no",
    "caso",
    "edad",
    "sexo",
    "identificacion",
    "identificación",
}
# Particles that may appear inside a name but do not count towards the
# minimum of two real name words.
_NAME_JOINERS = {"de", "del", "la", "las", "los", "y"}

# Capture group for a name on a single line: one letter followed by 5-80
# letters or plain spaces (no "\s", so the capture never crosses a newline).
# "Ü" is part of the class so surnames such as "ARGÜELLO" or "GÜIZA" are not
# cut at the dieresis. Every pattern below is compiled with IGNORECASE, so the
# class covers lower-case letters as well.
_NAME_CAPTURE = r"([A-ZÁÉÍÓÚÜÑ][A-ZÁÉÍÓÚÜÑ ]{5,80})"

# Tried in order by extract_patient_name_from_text; group(1) is the raw name.
_TEXT_PATTERNS = (
    # "Paciente: CC 12345678 - NAME" (document type and separators optional).
    re.compile(
        r"paciente\s*:\s*(?:cc|ti|ce|dni|nit)?\s*[-:]?\s*\d{5,12}\s*-\s*"
        + _NAME_CAPTURE,
        re.IGNORECASE,
    ),
    # "Nombre [del] paciente: NAME" (delimiter optional).
    re.compile(
        r"nombre(?:\s+del)?\s+paciente\s*[:#-]?\s*" + _NAME_CAPTURE,
        re.IGNORECASE,
    ),
    # "NAME Nombre [del] Paciente" (value printed before its label).
    re.compile(
        _NAME_CAPTURE + r"\s*Nombre(?:\s+del)?\s+Paciente",
        re.IGNORECASE,
    ),
    # "Paciente: NAME".
    re.compile(
        r"paciente\s*:\s*" + _NAME_CAPTURE,
        re.IGNORECASE,
    ),
)


def _normalize_spaces(value: str) -> str:
    """Return *value* as text with whitespace runs collapsed and ends trimmed.

    Any falsy input (``None``, ``""``, ``0`` ...) yields the empty string;
    any other input is converted with ``str()`` first.
    """
    text = str(value) if value else ""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()


def sanitize_patient_name_candidate(value: str) -> str:
    """Validate a raw text fragment as a patient name.

    The fragment is whitespace-normalized, lower-cased and stripped of
    punctuation, then rejected when it is a known form label, contains a
    disqualifying word, or has fewer than two real (non-joiner) name words.
    Trailing field labels such as "edad" or "sexo" are dropped first.

    Args:
        value: Candidate text, typically a regex capture or a whole line.

    Returns:
        The name upper-cased and limited to its first six words, or ``""``
        when the candidate is not acceptable.
    """
    # Local import so this function does not depend on the file's import block.
    import unicodedata

    # _normalize_spaces already turned every whitespace run into one space,
    # so only spaces and label delimiters can remain at the edges.
    raw = _normalize_spaces(value).strip(" :#-")
    if not raw:
        return ""

    # Compose decomposed accents ("e" + U+0301 -> "é"); otherwise the
    # combining mark would be treated as punctuation and split the word.
    normalized = unicodedata.normalize("NFC", raw.lower())

    # Lower-case ASCII plus the Latin-1 letters (U+00E0-U+00FF without the
    # division sign U+00F7). This covers á é í ó ú ñ and also ü, so
    # "AGÜERO" is no longer broken into "AG ERO".
    letters = r"a-z\u00e0-\u00f6\u00f8-\u00ff"
    normalized = re.sub(rf"[^{letters}0-9 ]+", " ", normalized)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    if not normalized:
        return ""

    if normalized in _INVALID_NAME_PHRASES:
        return ""
    if any(
        phrase in normalized
        for phrase in ("fecha egreso", "tipo y n", "telefono", "teléfono")
    ):
        return ""

    # Digits survive the filter above but never become part of a token.
    tokens = re.findall(rf"[{letters}]+", normalized)
    if len(tokens) < 2:
        return ""

    # Drop labels of the following form field that were captured with the name.
    while len(tokens) >= 2 and tokens[-1] in _TRAILING_LABEL_TOKENS:
        tokens.pop()
    if len(tokens) < 2:
        return ""

    # Particles ("de", "la", ...) do not count towards the two-word minimum.
    non_joiner_tokens = [token for token in tokens if token not in _NAME_JOINERS]
    if len(non_joiner_tokens) < 2:
        return ""
    if any(token in _INVALID_NAME_TOKENS for token in non_joiner_tokens):
        return ""

    return " ".join(token.upper() for token in tokens[:6])


# Markup layouts in which the name follows a "Nombre del paciente" label.
# Tried in order; group(1) is the raw name text.
_HTML_NAME_PATTERNS = (
    re.compile(
        r"<p><b>Nombre del paciente</b>\s*</p>\s*<p>([^<]+)</p>",
        re.IGNORECASE | re.DOTALL,
    ),
    re.compile(
        r"<p>Nombre del paciente[:\s]*</p>\s*<p>([^<]+)</p>",
        re.IGNORECASE | re.DOTALL,
    ),
    re.compile(
        r"<b>Nombre del paciente</b>\s*:?\s*([A-Za-zÀ-ÿ\u00f1\u00d1\s]+)(?:</p>|<br|$)",
        re.IGNORECASE,
    ),
    re.compile(
        r"<p>.*?Nombre.*?paciente.*?:?\s*([A-Za-zÀ-ÿ\u00f1\u00d1\s]{3,50}?)(?:</p>|<br|$)",
        re.IGNORECASE,
    ),
)
# A line that mentions the label at all (used by the line-based fallback).
_HTML_LABEL_LINE = re.compile(r"nombre.*paciente", re.IGNORECASE)
# A run of 5-50 letters/whitespace sitting between two tags.
_HTML_TEXT_RUN = re.compile(r">([A-Za-zÀ-ÿ\u00f1\u00d1\s]{5,50})<")


def extract_patient_name_from_html(analisis_html: str) -> str:
    """Extract the patient name from an HTML fragment.

    The known label/value layouts are tried first. If none yields an
    acceptable name, every line mentioning "nombre ... paciente" is located
    and the text runs between tags on that line and the three following
    lines are tested in document order.

    Args:
        analisis_html: HTML text to search; falsy input returns ``""``.

    Returns:
        The sanitized, upper-cased name, or ``""`` when nothing qualifies.
    """
    if not analisis_html:
        return ""

    for pattern in _HTML_NAME_PATTERNS:
        match = pattern.search(analisis_html)
        if not match:
            continue
        candidate = sanitize_patient_name_candidate(match.group(1))
        if candidate:
            return candidate

    lines = analisis_html.splitlines()
    for index, line in enumerate(lines):
        if not _HTML_LABEL_LINE.search(line):
            continue
        for probe_line in lines[index:index + 4]:
            # Test EVERY text run on the line: when label and value share a
            # line (e.g. two <td> cells of one table row) the first run is
            # the label itself, which the sanitizer rejects, and the name is
            # a later run on the same line.
            for text_run in _HTML_TEXT_RUN.finditer(probe_line):
                candidate = sanitize_patient_name_candidate(text_run.group(1))
                if candidate:
                    return candidate
    return ""


# A line consisting only of the label, optionally closed by ":", "#" or "-".
_TEXT_LABEL_ONLY_LINE = re.compile(
    r"(?:nombre(?:\s+del)?\s+paciente|paciente)\s*[:#-]?",
    re.IGNORECASE,
)
# The label anywhere inside a line.
_TEXT_LABEL_IN_LINE = re.compile(r"nombre(?:\s+del)?\s+paciente", re.IGNORECASE)
# Delimiters that may separate the label from its value.
_TEXT_LABEL_DELIMITER = re.compile(r"[:#-]")


def extract_patient_name_from_text(text: str) -> str:
    """Extract the patient name from plain text.

    The module-level ``_TEXT_PATTERNS`` are tried first, in order. If none
    yields an acceptable name, the text is scanned line by line:

    * a line holding only the label ("Paciente", "Nombre del paciente",
      with or without a closing delimiter) makes the next three lines
      candidates;
    * a line containing "Nombre [del] paciente" followed by a delimiter
      makes the text after that delimiter a candidate.

    Args:
        text: Plain text to search; falsy input returns ``""``.

    Returns:
        The sanitized, upper-cased name, or ``""`` when nothing qualifies.
    """
    if not text:
        return ""

    for pattern in _TEXT_PATTERNS:
        match = pattern.search(text)
        if not match:
            continue
        candidate = sanitize_patient_name_candidate(match.group(1))
        if candidate:
            return candidate

    lines = [_normalize_spaces(line) for line in text.splitlines()]
    for index, line in enumerate(lines):
        if not line:
            continue

        # Label alone on its line: the value is expected on a following line.
        # The optional closing delimiter lets "Nombre del paciente:" qualify.
        if _TEXT_LABEL_ONLY_LINE.fullmatch(line):
            for following in lines[index + 1:index + 4]:
                candidate = sanitize_patient_name_candidate(following)
                if candidate:
                    return candidate

        label = _TEXT_LABEL_IN_LINE.search(line)
        if label:
            # Look for the delimiter AFTER the label. Splitting the whole
            # line would cut at a delimiter that precedes the label
            # ("Datos - Nombre del paciente: ..."), leaving the label inside
            # the fragment, which the sanitizer then rejects.
            fragments = _TEXT_LABEL_DELIMITER.split(line[label.end():], maxsplit=1)
            if len(fragments) == 2:
                candidate = sanitize_patient_name_candidate(fragments[1])
                if candidate:
                    return candidate

    return ""
