import re
import fitz  # PyMuPDF
import openai
import os
import time
from concurrent.futures import ThreadPoolExecutor

# Import OCR libraries
import pytesseract
from PIL import Image
import io

class ImprovedDocumentProcessor:
    """Extract text from PDF, Word, image, and plain-text files and analyze it
    with an OpenAI chat model from a configurable legal perspective
    ("prosecutor", "defense", or "neutral").

    Extraction uses PyMuPDF with a pytesseract OCR fallback for scanned PDF
    pages, antiword for legacy .doc files, and mammoth/docx2txt/python-docx
    (in that order) for .docx files.
    """

    # Debug log for legacy .doc extraction; writes are best-effort only.
    DOC_LOG_PATH = '/var/log/epolaw/doc_extraction.log'

    def __init__(self, api_key, model="gpt-4o", max_tokens=4000, overlap=200):
        """Store configuration.

        Args:
            api_key: OpenAI API key used for each request.
            model: Chat model name (default "gpt-4o").
            max_tokens: Reserved for future chunking logic (currently unused).
            overlap: Reserved for future chunking logic (currently unused).
        """
        self.api_key = api_key
        self.model = model
        self.max_tokens = max_tokens
        self.overlap = overlap

    # ------------------------------------------------------------------
    # Text extraction
    # ------------------------------------------------------------------

    def extract_text_from_pdf(self, file_path):
        """Extract text from a PDF, using OCR for pages with no embedded text.

        Returns the combined text of all pages, or None if the PDF cannot be
        opened or read.
        """
        try:
            doc = fitz.open(file_path)
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return None
        try:
            text = ""
            for page_num, page in enumerate(doc):
                page_text = page.get_text()
                if page_text.strip():
                    # Page has an embedded text layer; use it directly.
                    text += page_text
                else:
                    # No text layer -> likely a scanned image; OCR fallback.
                    print(f"Using OCR for page {page_num+1} (no text found in PDF)")
                    try:
                        # Render at 2x scale for better OCR accuracy.
                        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                        page_text = pytesseract.image_to_string(img)
                        text += page_text + "\n\n"
                    except Exception as ocr_error:
                        print(f"OCR error on page {page_num+1}: {ocr_error}")
                        text += f"[OCR FAILED FOR PAGE {page_num+1}]\n\n"
            return text
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return None
        finally:
            # Fix: the document handle was previously never released.
            doc.close()

    def extract_text_from_image(self, file_path):
        """Extract text from an image file via Tesseract OCR; None on failure."""
        try:
            img = Image.open(file_path)
            return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"Error extracting text from image: {e}")
            return None

    def _log_doc_debug(self, message):
        """Append a timestamped line to the .doc extraction debug log.

        Best effort: a missing log directory or bad permissions previously
        raised inside extraction and aborted it; such failures are now
        swallowed so logging can never break text extraction.
        """
        try:
            from datetime import datetime
            with open(self.DOC_LOG_PATH, 'a') as log:
                log.write(f"[{datetime.now()}] {message}\n")
        except OSError:
            pass

    def _extract_legacy_doc(self, file_path):
        """Extract text from a legacy .doc file using the antiword binary.

        Returns the extracted text, or None if antiword is unavailable,
        fails, or produces no output.
        """
        try:
            import subprocess
            self._log_doc_debug(f"Detected legacy .doc file: {file_path}")
            print("Detected legacy .doc file - using antiword")
            result = subprocess.run(['/usr/bin/antiword', file_path],
                                    capture_output=True,
                                    text=True,
                                    timeout=60)
            self._log_doc_debug(f"antiword return code: {result.returncode}")
            self._log_doc_debug(f"stdout length: {len(result.stdout)}")
            self._log_doc_debug(f"stderr: {result.stderr}")

            if result.returncode == 0 and result.stdout.strip():
                print(f"Successfully extracted text using antiword: {len(result.stdout)} characters")
                return result.stdout
            print(f"antiword failed with return code {result.returncode}")
            if result.stderr:
                print(f"antiword stderr: {result.stderr}")
            return None
        except Exception as e:
            self._log_doc_debug(f"antiword exception: {e}")
            print(f"antiword extraction failed: {e}")
            return None

    def extract_text_from_docx(self, file_path):
        """Extract text from Word documents with fallback options.

        Legacy .doc files go through antiword; .docx files are tried with
        mammoth (best formatting preservation), then docx2txt, then
        python-docx (paragraphs and tables). Returns None if all fail.
        """
        lower = file_path.lower()
        if lower.endswith('.doc') and not lower.endswith('.docx'):
            return self._extract_legacy_doc(file_path)

        # mammoth first: best formatting preservation.
        try:
            import mammoth
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
            if result.value and result.value.strip():
                print(f"Successfully extracted text using mammoth: {len(result.value)} characters")
                if result.messages:
                    print(f"Mammoth conversion messages: {result.messages}")
                return result.value
        except Exception as e:
            print(f"Mammoth failed: {e}")

        # Fallback to docx2txt.
        try:
            import docx2txt
            text = docx2txt.process(file_path)
            if text and text.strip():
                print(f"Successfully extracted text using docx2txt: {len(text)} characters")
                return text
        except Exception as e:
            print(f"docx2txt failed: {e}")

        # Final fallback: python-docx (paragraphs plus table cells).
        try:
            import docx
            doc = docx.Document(file_path)
            parts = [paragraph.text + "\n" for paragraph in doc.paragraphs]
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        parts.append(cell.text + " ")
                    parts.append("\n")
            text = "".join(parts)
            if text.strip():
                print(f"Successfully extracted text using python-docx: {len(text)} characters")
                return text
            print("python-docx extracted empty text")
            return None
        except Exception as e:
            print(f"python-docx failed: {e}")
            return None

    def extract_text_from_file(self, file_path):
        """Dispatch text extraction by file extension.

        Supports .pdf, .txt, .doc/.docx, and common image formats.
        Returns the extracted text, or None for unsupported formats or
        extraction failures.
        """
        lower_path = file_path.lower()
        if lower_path.endswith('.pdf'):
            return self.extract_text_from_pdf(file_path)
        if lower_path.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                return f.read()
        if lower_path.endswith(('.docx', '.doc')):
            return self.extract_text_from_docx(file_path)
        if lower_path.endswith(('.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp')):
            return self.extract_text_from_image(file_path)
        print(f"Unsupported file format: {file_path}")
        return None

    # ------------------------------------------------------------------
    # Analysis
    # ------------------------------------------------------------------

    @staticmethod
    def _error_result(message):
        """Build the standard result dict for a failed analysis."""
        return {
            "final_analysis": message,
            "section_analyses": [],
            "num_sections": 0
        }

    @staticmethod
    def _build_system_prompt(perspective, truncated=False):
        """Build the system prompt for the requested perspective.

        These prompts were previously duplicated (with whitespace drift)
        in direct_analyze_document and process_document; this is now the
        single source of truth. Unknown perspectives fall back to neutral.
        """
        if perspective == "prosecutor":
            prompt = ("You are a prosecutor analyzing a legal document.\n"
                      "Focus on facts and legal elements that support the prosecution's case.\n"
                      "Identify potential violations, strong evidence, and applicable precedents.\n"
                      "Be objective but highlight aspects favorable to prosecution.")
        elif perspective == "defense":
            prompt = ("You are a defense attorney analyzing a legal document.\n"
                      "Focus on facts and legal elements that support the defense's case.\n"
                      "Identify weaknesses in prosecution evidence, potential defenses, and applicable precedents.\n"
                      "Be objective but highlight aspects favorable to defense.")
        else:  # neutral
            prompt = ("You are a neutral legal expert analyzing a legal document.\n"
                      "Provide a balanced assessment of the legal issues presented.\n"
                      "Identify key facts, legal questions, and applicable precedents from both perspectives.")
        if truncated:
            prompt += "\nNote that this document may have been truncated due to length."
        prompt += ("\nIf there are OCR errors in the text, please do your best to "
                   "interpret the intended meaning based on context.")
        return prompt

    def _analyze_text(self, text, system_prompt):
        """Send the document text to the chat model and return the analysis.

        Uses the openai >= 1.0.0 client API with a 60-second timeout.
        """
        client = openai.OpenAI(api_key=self.api_key, timeout=60.0)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Please analyze this legal document:\n\n{text}"}
            ],
            max_tokens=2000,
            temperature=0.2
        )
        return response.choices[0].message.content

    def direct_analyze_document(self, file_path, perspective="neutral"):
        """Analyze a document in a single model call (no chunking).

        Returns a dict with "final_analysis", "section_analyses", and
        "num_sections" keys; failures are reported inside the dict rather
        than raised.
        """
        try:
            text = self.extract_text_from_file(file_path)
            if not text:
                return self._error_result("Failed to extract text from document")

            print(f"Attempting direct analysis of document with {len(text)} characters")
            analysis = self._analyze_text(text, self._build_system_prompt(perspective))
            return {
                "final_analysis": analysis,
                "section_analyses": [analysis],
                "num_sections": 1
            }
        except Exception as e:
            print(f"Error in direct analysis: {e}")
            return self._error_result(f"Error in direct analysis: {str(e)}")

    def process_document(self, file_path, perspective="neutral"):
        """Process a document based on its size.

        Files under 300KB are analyzed directly; larger files are extracted,
        truncated to 50,000 characters, and analyzed in one call (proper
        chunking is not implemented yet). Returns the same result dict shape
        as direct_analyze_document.
        """
        try:
            file_size = os.path.getsize(file_path)
            print(f"Processing file: {file_path}, Size: {file_size} bytes")

            if file_size < 300 * 1024:  # Less than 300KB
                print(f"Using direct analysis for small file ({file_size} bytes)")
                return self.direct_analyze_document(file_path, perspective)

            print(f"Using direct analysis with truncation for larger file ({file_size} bytes)")
            text = self.extract_text_from_file(file_path)
            if not text:
                return self._error_result("Failed to extract text from document")

            max_chars = 50000  # Adjust as needed
            if len(text) > max_chars:
                print(f"Truncating text from {len(text)} to {max_chars} characters")
                text = text[:max_chars] + "\n\n[Document truncated due to length]"

            # The truncation note is always included on this path, matching
            # the original behavior (even when the text fit untruncated).
            analysis = self._analyze_text(
                text, self._build_system_prompt(perspective, truncated=True))
            return {
                "final_analysis": analysis,
                "section_analyses": [analysis],
                "num_sections": 1
            }
        except Exception as e:
            import traceback
            print(f"Error in process_document: {e}")
            print(traceback.format_exc())
            return self._error_result(f"Error processing document: {str(e)}")
