import json
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from text_cleaner import TextCleaner

class DocumentProcessor:
    """Base class for document processors.

    Subclasses override :meth:`process` to turn the file at a given path
    into a list of ``Document`` objects.
    """

    def process(self, file_path: str) -> list[Document]:
        """Process the file at ``file_path``.

        This base implementation never returns.

        Raises:
            NotImplementedError: always; subclasses must provide the
                real implementation.
        """
        message = "Este método debe ser implementado por las subclases."
        raise NotImplementedError(message)

class JSONDocumentProcessor(DocumentProcessor):
    """Processor that turns a JSON file into ``Document`` objects."""

    def process(self, file_path: str) -> list[Document]:
        """Load a JSON file and wrap its content in documents.

        A top-level JSON array yields one document per element; any other
        top-level value yields a single document. Each entry is serialized
        back to JSON text (non-ASCII characters are kept unescaped), passed
        through ``TextCleaner.clean_text`` and tagged with ``file_path``
        under the ``source`` metadata key.
        """
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Wrap a non-list payload in a one-element list so both shapes share
        # a single code path.
        entries = payload if isinstance(payload, list) else [payload]
        return [
            Document(
                page_content=TextCleaner.clean_text(
                    json.dumps(entry, ensure_ascii=False)
                ),
                metadata={"source": file_path},
            )
            for entry in entries
        ]

class PDFDocumentProcessor(DocumentProcessor):
    """Processor that loads a PDF file through ``PyPDFLoader``."""

    def process(self, file_path: str) -> list[Document]:
        """Load the PDF and clean the text of every loaded document.

        The documents returned by the loader are updated in place: their
        ``page_content`` is replaced with the output of
        ``TextCleaner.clean_text``. The same list is then returned.
        """
        loaded_docs = PyPDFLoader(file_path).load()
        for loaded in loaded_docs:
            cleaned = TextCleaner.clean_text(loaded.page_content)
            loaded.page_content = cleaned
        return loaded_docs

class DocumentProcessorFactory:
    """Factory that picks the processor matching a file type."""

    @staticmethod
    def get_processor(file_extension: str) -> DocumentProcessor:
        """Return a new processor instance for ``file_extension``.

        The extension is compared exactly against ``'json'`` and ``'pdf'``:
        no leading dot, and the comparison is case-sensitive.

        Raises:
            ValueError: if the extension is neither of the supported values.
        """
        # Reject anything unsupported up front, then dispatch.
        if file_extension not in ('json', 'pdf'):
            raise ValueError(f"Tipo de archivo no soportado: {file_extension}")
        if file_extension == 'json':
            return JSONDocumentProcessor()
        return PDFDocumentProcessor()

def process_folder(folder_path: str) -> list[Document]:
    """Process every regular file located directly inside a folder.

    Entries are visited in sorted name order so the resulting document
    order is reproducible. Subdirectories and other non-file entries are
    skipped. Each file's extension (text after the last dot, lower-cased)
    selects a processor through ``DocumentProcessorFactory``.

    A ``ValueError`` raised while selecting the processor or while
    processing a file is reported on stdout as a warning and the file is
    skipped; processing continues with the next entry.

    Args:
        folder_path: Path of the folder to scan (not recursive).

    Returns:
        The documents produced by all successfully processed files.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            # A directory (even one named like "data.json") cannot be
            # opened as a document, so skip it instead of failing.
            continue
        # splitext gives "" for names without an extension, so a file
        # literally called "json" or "pdf" is no longer mistaken for one.
        file_extension = os.path.splitext(file_name)[1].lstrip('.').lower()
        try:
            processor = DocumentProcessorFactory.get_processor(file_extension)
            documents.extend(processor.process(file_path))
        except ValueError as e:
            # Best effort: warn and keep going. ValueError also covers
            # json.JSONDecodeError and UnicodeDecodeError, whose messages do
            # not mention the file, so the file name is included here.
            print(f"Advertencia: {e} (archivo: {file_name})")
    return documents
