import os
from typing import Optional
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from PyPDF2 import PdfReader
import argparse
from modules.processing.embeddings import get_hf_embeddings

# Identifier of the embeddings model handed to get_hf_embeddings(). It can be
# overridden through the EMBEDDINGS_MODEL environment variable; when that
# variable is unset, 'intfloat/e5-small-v2' is used.
EMBEDDINGS_MODEL = os.getenv('EMBEDDINGS_MODEL', 'intfloat/e5-small-v2')


def build_soat_index_from_pdf(pdf_path: str, index_dir: str) -> str:
    """Build a FAISS index from a SOAT manual PDF and save it to disk.

    Every page that yields text becomes one ``Document`` whose metadata holds
    the PDF's base file name (``source``) and the 1-based page number
    (``page``). Pages without text, or whose text extraction raises, are
    skipped (extraction failures are logged as warnings).

    Args:
        pdf_path: Path to the PDF file to index.
        index_dir: Directory the index is saved into; created if missing.

    Returns:
        ``index_dir``, exactly as received.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing regular file
            (this includes the case where it points to a directory).
        ValueError: If no page of the PDF yields any text.
    """
    import logging

    # isfile (rather than exists) rejects directories up front instead of
    # letting them fail later inside the PDF reader.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF no encontrado: {pdf_path}")

    logger = logging.getLogger(__name__)
    source_name = os.path.basename(pdf_path)

    reader = PdfReader(pdf_path)
    documents = []
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception as exc:
            # Best effort: one unreadable page must not abort the whole
            # build, but the skip should be visible to the operator.
            logger.warning(
                "Skipping page %d of %s: text extraction failed (%s)",
                page_num,
                source_name,
                exc,
            )
            continue
        text = text.strip()
        if not text:
            continue
        documents.append(
            Document(
                page_content=text,
                metadata={
                    "source": source_name,
                    "page": page_num,
                },
            )
        )

    if not documents:
        raise ValueError("El PDF no contiene texto extraíble para indexar.")

    # Create the output directory only once there is something to index (so
    # a failed build leaves no empty directory behind), yet before computing
    # embeddings so an unwritable destination fails early.
    os.makedirs(index_dir, exist_ok=True)

    embeddings = get_hf_embeddings(EMBEDDINGS_MODEL)
    vs = FAISS.from_documents(documents, embeddings)
    vs.save_local(index_dir)
    return index_dir


def main():
    """Command-line entry point for building the SOAT manual index.

    Parses ``--pdf`` (input PDF path) and ``--out`` (output index directory),
    echoes both values, and runs ``build_soat_index_from_pdf``. On any
    failure the error message is printed and the exception is re-raised.
    """
    cli = argparse.ArgumentParser(
        description="Construir índice FAISS del Manual SOAT",
    )
    cli.add_argument(
        "--pdf",
        dest="pdf_path",
        default="Manual-SOAT-2025.pdf",
        help="Ruta al PDF del Manual SOAT (por defecto: Manual-SOAT-2025.pdf)",
    )
    cli.add_argument(
        "--out",
        dest="index_dir",
        default="soat_faiss",
        help="Directorio de salida del índice (por defecto: soat_faiss)",
    )
    options = cli.parse_args()

    print(f"📄 PDF: {options.pdf_path}")
    print(f"📁 Índice: {options.index_dir}")
    try:
        result_dir = build_soat_index_from_pdf(
            options.pdf_path,
            options.index_dir,
        )
        print(f"✅ Índice SOAT construido en: {result_dir}")
    except Exception as err:
        # Report the failure on the console, then propagate it unchanged.
        print(f"❌ Error construyendo índice: {err}")
        raise


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main() 
