
    )i'                     :   U d dl mZ d dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ  ej                   e      Z ej&                  dd      Z ej&                  dd      Zdae
e   ed	<   dae
e   ed
<   dZdZdefdZdefdZdededede
e   fdZdededefdZdededefdZ dededdfdZ!dededee   fdZ"dedee   fdZ#d ed!ed"edefd#Z$d$ee   d ed"edee	eef      fd%Z%d&e	eef   d ed'efd(Z&d,d)ee	eef      d ed'ed*e'fd+Z(y)-    )PathN)AnyListDictOptional)Mistral)OpenAIMISTRAL_API_KEY OPENAI_API_KEY_mistral_client_openai_clientu  You are a precise data extraction system designed to parse OCR-derived markdown text containing multiple-choice questions (MCQs) and image references. Your objective is to produce a strictly structured JSON array where each element represents one question. You must not include any text, markdown, or commentary outside the JSON.

Each extracted object must follow this exact schema:
{
  "Question": string or null,             # the question text
  "Options": {                            # the four options if available
    "A": string,
    "B": string,
    "C": string,
    "D": string
  } or null,                              # if options are missing, set to null
  "FigureRefs": [                         # filenames of any referenced images
    "img-5.jpeg", "img-6.jpeg"            # or [] if none
  ]
}

Parsing rules:
- Identify each question block accurately, even if spacing or numbering is inconsistent.
- Extract the full question text excluding option lines and image markdown.
- Detect image references from markdown patterns like ![...](filename.jpeg) and list only the filename(s) in 'FigureRefs'.
- If a question contains an image but no options, set 'Question' and 'Options' to null, and include the image filename in 'FigureRefs'.
- Never fabricate or guess content not present in the text.
- Preserve all punctuation and numeric tokens exactly as seen.
- The final output must be a valid JSON array with no comments, markdown, or explanatory text.

Output only the JSON array — nothing else.u  You are an expert linguistic normalizer and validator for OCR-derived multiple-choice questions (MCQs).
Your task is to analyze the given MCQ — consisting of a "Question" and its "Options" — and determine whether any textual correction is required.

Evaluation and Correction Rules:

1. If both the question and all options are already clear, readable, and semantically correct, return exactly null (without quotes, spaces, or JSON formatting).

2. Otherwise, apply precise corrections only where necessary, following these principles:
   - Correct OCR artefacts, spacing errors, or punctuation issues.
   - Fix clear grammatical mistakes without altering meaning.
   - Preserve all numbers, symbols, and domain-specific notation.
   - Keep the same option labels ("A", "B", "C", "D") and their logical order.
   - Do not invent, rephrase, or guess missing content.
   - If some text is incomplete or ambiguous, leave it unchanged.

Output Requirements:

- If no correction is needed → return: null
- If corrections are made → return a valid JSON object strictly in this form:
{
  "Question": "corrected question text",
  "Options": {
    "A": "corrected text",
    "B": "corrected text",
    "C": "corrected text",
    "D": "corrected text"
  }
}

Important:
- Do not wrap the JSON in code fences or explanations.
- Output only the JSON object or null.
- The output must be valid JSON when applicable.
returnc                  :    t         t        t              a t         S N)api_key)r   r   r
        3C:\xampp\htdocs\eduruby\utils\pipeline_functions.pyget_mistral_clientr   W   s    !/:r   c                  :    t         t        t              a t         S r   )r   r	   r   r   r   r   get_openai_clientr   ]   s    7r   
target_dirimage_idbase64_stringc                 T   	 |j                  dd      \  }}	 t	        j
                  |      }| j                  dd       | |z  }|j                  |       |S # t        $ r t        j                  d|       Y y w xY w# t        $ r!}t        j                  d||       Y d }~y d }~ww xY w)N,   z%Unexpected base64 image format for %sTparentsexist_okzFailed to write image %s: %s)
split
ValueErrorloggererrorbase64	b64decodemkdirwrite_bytes	Exception	exception)r   r   r   headerencodedbpes           r   write_image_from_base64r1   c   s    '--c15W%5!	a  <hG  71Es(   A ?A= A:9A:=	B'B""B'pdf_pathmistralc                     t         j                  d|        t        | d      5 }|j                  j	                  | j
                  |dd      }d d d        |S # 1 sw Y   S xY w)NzUploading pdf to Mistral: %srb)	file_namecontentocr)filepurpose)r$   infoopenfilesuploadname)r2   r3   fhr>   s       r   upload_pdf_for_ocrrA   s   sV    
KK.9	h	%%SU+V`e%f 
M 
Ms   +AA"uploaded_fileclientc           
      V   | j                   }t        j                  d|       |j                  j	                  |      }t        j                  d       |j
                  j                  dd|j                  dd      }t        j                  d	t        t        |d
g                    |S )z:Request OCR processing and return the OCR response object.z$Retrieving signed URL for file id=%s)file_idzCalling OCR modelzmistral-ocr-latestdocument_url)typerF   T)modeldocumentinclude_image_base64u   OCR completed — pages: %spages)
idr$   r;   r=   get_signed_urlr8   processurllengetattr)rB   rC   rE   
signed_urlocr_responses        r   run_ocr_on_uploaded_filerT   y   s    G
KK6@,,W,=J
KK#$::%%"(*..I! & L
 KK-s7<RT3U/VWr   rS   out_pathc                     | j                   D cg c]  }t        |dd       }}dj                  |      }|j                  j	                  dd       |j                  |d       y c c}w )Nmarkdownr   

---

Tr   zutf-8)encoding)rK   rQ   joinparentr(   
write_text)rS   rU   r/   page_markdownsocr_texts        r   save_ocr_markdownr_      sg    :F:L:LM:LQgaR0:LNM!!.1HOO$673 Ns   A'c                 "   g }|j                  dd       | j                  D ]j  }t        |dg       xs g D ]T  }t        |dd       xs dt        |      dz    d}t        |dd       }|s4t	        |||      }|sD|j                  |       V l |S )	NTr   imagesrL   zimg-r   z.binimage_base64)r(   rK   rQ   rP   r1   append)rS   r   writtenpageimgimg_idb64r/   s           r   extract_images_from_ocrri      s    GTD1""42.4"4CS$-L4GQ7Gt1LF#~t4C+JDgnnQ' 5 # Nr   md_textc                     | j                  d      D cg c]#  }|j                         s|j                         % c}S c c}w )NrX   )r"   strip)rj   r/   s     r   split_pages_from_markdownrm      s3    &}}];I;!qwwyAGGI;IIIs   ??openai_client	page_textsystem_promptc                    d| }	 | j                   j                  j                  dd|dd|dgddid	      }|j                  d   j                  j
                  j                         }	 t        j                  |      S # t        $ r. 	 t        j                  |      cY S # t        $ r	 d
|icY cY S w xY ww xY w# t        $ r-}t        j                  d|       dt        |      icY d }~S d }~ww xY w)Nz0Extract the MCQs from the following page text:

gpt-4o-minisystemroler7   userrG   json_objectr   rH   messagesresponse_formattemperatureraw_textzExtraction LLM error: %sr%   )chatcompletionscreatechoicesmessager7   rl   jsonloadsr*   demjson3decoder$   r+   str)rn   ro   rp   promptresponseresult_textr0   s          r   call_extraction_llmr      s    A)MF! %%1188'MBV`fDgh#]3	 9 
 &&q)1199??A	1::k** 	11{33 1"K001		1  !3Q7Q  !s`   A$B; ,B 	B8B"B8 B; "B4/B81B; 3B44B88B; ;	C1"C,&C1,C1rK   c                     g }t        | d      D ]<  \  }}t        j                  d|       t        |||      }|j	                  ||d       > |S )Nr   )startzLLM extract page %s)re   r7   )	enumerater$   r;   r   rc   )rK   rn   rp   resultsidxr/   page_results          r   extract_questions_from_pagesr      sQ    GE+Q)3/)-MJ<= , Nr   question_entryvalidation_promptc           	         | j                  d      r| j                  d      s| dfS | d   | d   d}	 |j                  j                  j                  dd|ddt	        j
                  |d	      dgd
did      }|j                  d   j                  j                  }|!|j                         j                         dk(  r| dfS t	        j                  |      }| j                         }|j                  d|d         |d<   t        |j                  d      t              r|d   |d<   |dfS # t        $ r$}t         j#                  d|       | dfcY d }~S d }~ww xY w)NQuestionOptionsF)r   r   rr   rs   rt   rv   )ensure_asciirG   rw   r   rx   nullTzValidation LLM error: %s)getr}   r~   r   r   dumpsr   r   r7   rl   lowerr   copy
isinstancedictr*   r$   r+   )	r   rn   r   payloadr   raw	correctedoutr0   s	            r   validate_question_via_openair      sq   j)1C1CI1Nu$$)*5.QZB[\G% %%1188&1BCF]a]g]gho  D  ^E  EF  G#M2	 9 
 q!))11;#))+++-7!5((JJsO	!!##--
C
ODJimmI.5&y1C	NDy %3Q7u$$%s%   BD- A'D- -	E6EEE	questionsmax_workersc           
         ddl m}m} d gt        |       z  }fd}d} ||      5 }	t	        |       D 
cg c]  \  }
}|	j                  ||
|       }}
} ||      D ]%  }|j                         \  }\  }}|||<   |s!|dz  }' 	 d d d        t        j                  d|       |D cg c]  }||	 c}S c c}}
w # 1 sw Y   8xY wc c}w )Nr   )ThreadPoolExecutoras_completedc                 "    | t        |      fS )N)r   )r   qrn   r   s     r   _wz*process_questions_concurrently.<locals>._w   s    0MCTUUUr   )r   r   zValidation done: %d changed)	concurrent.futuresr   r   rP   r   submitresultr$   r;   )r   rn   r   r   r   r   refinedr   changedexir   futuresfutr   entrychanged_flagrs    ``               r   process_questions_concurrentlyr      s    Cfs9~%GVG		43<Y3GH3G41a299RA&3GH(C),&C&% GCLW\W ) 
5 KK-w70w!!-Aw00 I 
5	4 1s.   CB<+CC-C5C<CC)   ))pathlibr   osr   loggingr&   typingr   r   r   r   r   	mistralair   openair	   	getLogger__name__r$   getenvr
   r   r   __annotations__r   SYSTEM_PROMPT_EXTRACTVALIDATION_PROMPTr   r   r   r1   rA   rT   r_   ri   rm   r   r   r   intr   r   r   r   <module>r      s    	    , ,   			8	$
 "))-r2+R0 &*'" )#'  '3 @9 >G 6   C T\]aTb    C C  S 4C 44 4D 4
# 
4 
DJ 
Js JtCy J!v !# !c !VY !.S	 & Y\ aefjknpsksftau %c3h %PV %kn %21d4S>.B 1SY 1nq 1  AD 1r   