import base64
import os
import sys
from glob import glob
from json import dump, dumps, load
from shutil import rmtree

import pdfplumber
from decouple import config
from docx2txt import process
from elasticsearch import Elasticsearch
from fpdf import FPDF
from genson import SchemaBuilder
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import ElasticsearchStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from natsort import natsorted
from pandas import Timestamp, read_excel
from PIL import Image
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError
from pypandoc import convert_file
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas
from spire.presentation import FileFormat, Presentation


sys.path.append(os.path.join(os.path.dirname(__file__), 'img_atlog'))
from img_atlog import LogUtils

logger = LogUtils.getRootLogger()
os.environ["OPENAI_API_KEY"] = config('SECRET_KEY')


es = Elasticsearch(config('ES_URL'), request_timeout=300, retry_on_timeout=True)

embeddings = OpenAIEmbeddings()


logger.info('indexing module loaded')

def file_format_change(sharepoint_localpath):
    """Rename image files with upper-case extensions to lower-case."""
    for filename in os.listdir(sharepoint_localpath):
        root, ext = os.path.splitext(filename)
        # Lower-case the extension of image files (.PNG -> .png, etc.)
        if ext in ('.PNG', '.JPEG', '.BMP', '.JPG'):
            old_file = os.path.join(sharepoint_localpath, filename)
            new_file = os.path.join(sharepoint_localpath, root + ext.lower())
            os.rename(old_file, new_file)
             
def convert_text_and_images_to_pdf(input_file, temp_dir, output_file):
    # docx2txt extracts the document text and writes embedded images to temp_dir,
    # which must exist before the call
    os.makedirs(temp_dir, exist_ok=True)
    text = process(input_file, temp_dir)
    c = canvas.Canvas(output_file, pagesize=letter)
    width, height = letter
    y = height - 40  # Start 40 points from the top

    for line in text.split('\n'):
        # Lines of the form "[IMAGE:<filename>]" are treated as image placeholders
        if line.startswith("[IMAGE:"):
            image_path = os.path.join(temp_dir, line.strip()[7:-1])
            if os.path.exists(image_path):
                c.drawImage(ImageReader(image_path), 30, y - 100, width=200, height=100)
                y -= 120  # Leave room for the image
        else:
            c.drawString(30, y, line)
            y -= 14  # Move down 14 points per line of text
        if y < 40:  # Start a new page once within 40 points of the bottom
            c.showPage()
            y = height - 40

    c.save()
    rmtree(temp_dir)

def convert_txt_to_pdf(txt_path):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=15)

    # Try utf-8 first; fall back to latin-1 if the file is not valid utf-8
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        with open(txt_path, "r", encoding="latin-1") as f:
            lines = f.readlines()

    # Write each line of text into the PDF
    for line in lines:
        pdf.cell(200, 10, txt=line.strip(), ln=1, align='C')

    # Save the PDF alongside the source file with a "_txt.pdf" suffix
    txt_path_without_ext = os.path.splitext(txt_path)[0]
    pdf.output(f"{txt_path_without_ext}_txt.pdf")

def save_text_to_pdf(text, output_path):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    
    # Path to the bundled DejaVuSans.ttf (needed for Unicode text)
    font_path = os.path.join(os.path.dirname(__file__), "DejaVuSans.ttf")

    pdf.add_font("DejaVu", fname=font_path, uni=True)
    pdf.set_font("DejaVu", size=12)

    lines = text.split("\n")
    for line in lines:
        pdf.multi_cell(0, 10, line)

    pdf.output(output_path)
    
    
def pptx_to_pdf(pptx_path):
    try:
        presentation = Presentation()
        # Load a PowerPoint presentation in PPTX format
        presentation.LoadFromFile(pptx_path)
        pdf_path = f"{os.path.splitext(pptx_path)[0]}_pptx.pdf"
        # Convert the presentation to PDF format
        presentation.SaveToFile(pdf_path, FileFormat.PDF)
        presentation.Dispose()
        print(f"Successfully converted {pptx_path} to {pdf_path}")
        return True
    except Exception as e:
        print(f"Failed to convert {pptx_path} to PDF. Error: {e}")
        return False
    
def convert_md_to_pdf(md_file_path, output_file_path):
    # Requires pandoc and, for --pdf-engine=xelatex, a TeX installation
    output = convert_file(md_file_path, 'pdf', outputfile=output_file_path, extra_args=['--pdf-engine=xelatex'])
    assert output == ""  # convert_file returns an empty string when an output file is written
    print(f"Converted {md_file_path} to {output_file_path}")
        

def readfiles_pdf(path):
    # Note: chdir changes the working directory for the rest of the process
    os.chdir(path)
    pdfs = []
    for file in glob("*.pdf"):
        try:
            reader = PdfReader(file)
            if len(reader.pages) == 0:
                raise PdfReadError("Empty file")
            pdfs.append(file)
        except PdfReadError as e:
            print(f"Error reading {file}: {e}")
            logger.exception('Error reading %s: %s' % (file, e))
        except Exception as e:
            print(f"An unexpected error occurred with {file}: {e}")
    return pdfs

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
    
    
def resize_and_convert_image(image_path):
    with Image.open(image_path) as img:
        # Shrink the image in place if either dimension exceeds 1024 pixels
        max_size = (1024, 1024)
        img.thumbnail(max_size, Image.LANCZOS)

        root, ext = os.path.splitext(image_path)
        if img.format == 'BMP':
            # Convert BMP images to JPEG (JPEG has no alpha channel, so convert to RGB first)
            img = img.convert('RGB')
            converted_path = f"{root}.jpeg"
            img.save(converted_path, format='JPEG')
        else:
            # Save the resized image to a new file in the same format
            converted_path = f"{root}.resized{ext}"
            img.save(converted_path)
        return converted_path
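
# Path mapping examples (a sketch based on the logic above; the filenames are
# hypothetical):
#   resize_and_convert_image('/data/photo.bmp') -> '/data/photo.jpeg'
#   resize_and_convert_image('/data/scan.png')  -> '/data/scan.resized.png'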
        
def convert_docx_to_pdf(docx_path, pdf_path):
    try:
        convert_file(docx_path, 'pdf', outputfile=pdf_path, extra_args=['--pdf-engine=xelatex'])
        print(f"Successfully converted {docx_path} to {pdf_path}")
        return True
    except Exception as e:
        print(f"Failed to convert {docx_path} to PDF. Error: {e}")
        return False
        
def row_to_json(row, column_names):
    """
    Convert a row to a JSON object with special handling for Timestamp objects.
    """
    data = {}
    for col_name in column_names:
        value = row[col_name]
        if isinstance(value, Timestamp):
            value = value.isoformat()  # Convert Timestamp to ISO format string
        data[col_name] = value
    return data
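
# A quick illustration (a minimal sketch; the row values are made up):
#   row = {'name': 'widget', 'updated': Timestamp('2024-01-01')}
#   row_to_json(row, ['name', 'updated'])
#   -> {'name': 'widget', 'updated': '2024-01-01T00:00:00'}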

def pdf_to_json_new(input_pdf_path, output_json_path):
    # Initialize a dictionary to hold the pages' content
    pdf_content = {"pages": []}

    # Open the PDF file with pdfplumber
    with pdfplumber.open(input_pdf_path) as pdf:
        # Iterate through each page and extract its text
        for page in pdf.pages:
            text = page.extract_text()
            pdf_content["pages"].append({
                'text': text
            })

    # Write the content to a JSON file
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        dump(pdf_content, json_file, ensure_ascii=False, indent=4)
        

def excel_to_json(filename, column_names, output_folder, bot_file_name):
    """
    Convert each row with specific columns in an Excel file to individual JSON files.
    """
    df = read_excel(filename)
    base_name = os.path.splitext(bot_file_name)[0]
    for index, row in df.iterrows():
        json_data = {'content': [row_to_json(row, column_names)]}
        output_filename = f"{output_folder}/{base_name}_{index}.json"
        with open(output_filename, 'w') as f:
            dump(json_data, f, indent=4)

    # Overwrite the workbook with only its header row, so the same data rows
    # are not exported again on a subsequent run
    df_header = df.head(0)
    df_header.to_excel(filename, index=False)

def generate_schema_from_json(file_path):
    with open(file_path, 'r') as file:
        # Load the JSON data
        data = load(file)
 
    # Initialize the schema builder
    builder = SchemaBuilder()
 
    # Add the JSON data to the schema builder
    builder.add_object(data)
 
    # Get the generated schema
    schema = builder.to_schema()
 
    # Convert the schema dictionary to a JSON string
    schema_string = dumps(schema, indent=2)
    return schema_string
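
# A quick illustration of the generated schema (a rough sketch; exact key
# ordering and the "$schema" header may vary by genson version):
#   for a file containing {"a": 1}, generate_schema_from_json returns roughly
#   {"type": "object", "properties": {"a": {"type": "integer"}}, "required": ["a"]}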
 

def load_json_directory(directory_path, success_file_list):
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            try:
                file_path = os.path.join(directory_path, filename)
                # Generate a JSON Schema for the file (kept for debugging). Note
                # that JSONLoader's jq_schema expects a jq query, not a JSON
                # Schema, so the whole document is loaded with "."
                schema_string = generate_schema_from_json(file_path)
                logger.debug('schema for %s: %s' % (filename, schema_string))
                loader = JSONLoader(file_path, jq_schema=".", text_content=False)
                docs = loader.load()
                # Add loaded documents to the list
                documents.extend(docs)
                success_file_list.append(filename)

            except Exception as e:
                logger.info('JSON conversion failed %s' % filename)
                print(e)

    return documents

def batch_convert_docx_to_pdf(input_directory, output_directory, success_file_list, temp_dir):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for filename in os.listdir(input_directory):
        if filename.endswith('.docx'):
            input_file_path = os.path.join(input_directory, filename)
            output_file_name = os.path.splitext(filename)[0] + '.pdf'
            output_file_path = os.path.join(output_directory, output_file_name)
            conv_status = convert_docx_to_pdf(input_file_path, output_file_path)
            if conv_status:
                logger.info('pandoc docx conversion succeeded %s' % input_file_path)
                success_file_list.append(filename)
            else:
                # Fall back to plain text-and-image extraction via docx2txt
                logger.info('falling back to text/image extraction %s' % input_file_path)
                try:
                    convert_text_and_images_to_pdf(input_file_path, temp_dir, output_file_path)
                    success_file_list.append(filename)
                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    logger.exception('Exception %s' % exc_value)
                    logger.info('docx to pdf conversion failed %s' % filename)

         
def add_session_id_to_documents(documents, session_id):
    updated_documents = []
    for doc in documents:
        # Assuming 'metadata' is an attribute or can be accessed/modified directly
        if not hasattr(doc, 'metadata'):
            doc.metadata = {}
        doc.metadata['session_id'] = session_id
        updated_documents.append(doc)
    return updated_documents
   
def Image_indexing(sharepoint_localpath, indexname, session_id):
    try:
        success_file_list = []
        status_code = 1
        org_filelist = os.listdir(sharepoint_localpath)

        ############################### image to pdf #####################

        # Initialize the ChatOpenAI model with the OpenAI API key
        chat = ChatOpenAI(openai_api_key=config('SECRET_KEY'), model="gpt-4o", max_tokens=1024)


        # Use glob to collect all supported image files (jpg, jpeg, png, bmp)
        supported_formats = ['.jpg', '.jpeg', '.png', '.bmp']
        image_files = []
        for ext in supported_formats:
            image_files.extend(glob(os.path.join(sharepoint_localpath, f'*{ext}')))

        # Iterate through each image file
        for image_file in image_files:
            # Resize and convert the image if necessary
            processed_image_path = resize_and_convert_image(image_file)

            # Encode the image
            image = encode_image(processed_image_path)

            # Determine the image MIME type
            if processed_image_path.endswith('.jpeg') or processed_image_path.endswith('.jpg'):
                mime_type = 'image/jpeg'
            elif processed_image_path.endswith('.png'):
                mime_type = 'image/png'
            else:
                mime_type = 'image/jpeg'  # Default to JPEG if BMP was converted

            # Ask the model to OCR the image
            try:
                msg = chat.invoke([
                    SystemMessage(
                        content="You are a useful bot that is especially good at OCR from images"
                    ),
                    HumanMessage(
                        content=[
                            {"type": "text", "text": "Extract all of the text from this image."},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{image}"
                                },
                            },
                        ]
                    )
                ])
                # Save the extracted text to a PDF file next to the image
                base_name = os.path.splitext(os.path.basename(image_file))[0]
                original_ext = os.path.splitext(image_file)[1].lstrip('.')
                pdf_filename = f"{base_name}_{original_ext}.pdf"
                pdf_path = os.path.join(sharepoint_localpath, pdf_filename)
                save_text_to_pdf(msg.content, pdf_path)
                success_file_list.append(os.path.basename(image_file))
            except Exception as e:
                logger.exception(f"Failed to process image {image_file}: {e}")
                status_code = 0
            
                
        ############################### pdf to json ####################
        img_json_path = os.path.join(os.path.dirname(__file__), 'img_json_conv')
        pdfs = readfiles_pdf(sharepoint_localpath)
        os.makedirs(img_json_path, exist_ok=True)
        pdf_success = []
        for pdf_file in pdfs:
            pdf_path = os.path.join(sharepoint_localpath, pdf_file)

            # Create a PDF reader object
            reader = PdfReader(pdf_path)

            # Dictionary to store each page's text
            pdf_data = {"pages": []}
            try:
                # Loop through each page and extract its text
                for page in reader.pages:
                    pdf_data["pages"].append(page.extract_text())

                # Convert the dictionary to a JSON string
                pdf_json = dumps(pdf_data, indent=4)

                # Save the JSON string to a file in the output directory
                json_filename = pdf_file.replace('.pdf', '.json')
                json_path = os.path.join(img_json_path, json_filename)

                with open(json_path, 'w') as json_file:
                    json_file.write(pdf_json)
                logger.info('PyPDF2 extraction succeeded %s' % pdf_file)
                pdf_success.append(pdf_file.split('.')[0])
            except Exception:
                # Fall back to pdfplumber when PyPDF2 fails on this file
                try:
                    json_filename = pdf_file.replace('.pdf', '.json')
                    output_json_path = os.path.join(img_json_path, json_filename)
                    pdf_to_json_new(pdf_path, output_json_path)
                    logger.info('pdfplumber extraction succeeded %s' % pdf_file)
                    pdf_success.append(pdf_file.split('.')[0])
                except Exception as e:
                    logger.info('exception occurred for pdf %s' % pdf_file)
                    logger.exception('Exception occurred due to %s' % e)
                    status_code = 0

        for i in natsorted(glob(os.path.join(img_json_path, '*.json'))):
            loader = JSONLoader(file_path=i, jq_schema=".pages[]", text_content=False)
            json_documents = loader.load()
            if json_documents:
                # Tag each document with the session id before indexing
                json_documents_with_session_id = add_session_id_to_documents(json_documents, session_id)
                db = ElasticsearchStore.from_documents(
                    documents=json_documents_with_session_id,
                    es_connection=es,
                    index_name=indexname,
                    embedding=embeddings,
                    strategy=ElasticsearchStore.ExactRetrievalStrategy())


        logger.info('pdf index added')

        logger.info('success_file_list %s' % success_file_list)
        if len(success_file_list) != len(org_filelist):
            status_code = 0
        rmtree(img_json_path)
        logger.info('temporary json files removed')
        return status_code
    except Exception as e:
        logger.exception('Exception occurred due to %s' % e)
        return 0
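
# Example usage (a minimal sketch; the directory, index name, and session id
# below are hypothetical placeholders, not values taken from this project):
if __name__ == '__main__':
    local_path = '/tmp/sharepoint_files'  # hypothetical download directory
    file_format_change(local_path)        # normalize upper-case image extensions
    status = Image_indexing(local_path, indexname='demo-index', session_id='session-001')
    logger.info('Image_indexing finished with status %s' % status)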
