
from genai import Client, Credentials
import datetime
import pytz
import logging
import json
import copy
from dotenv import load_dotenv
from genai.text.generation import CreateExecutionOptions
from genai.schema import (
    DecodingMethod,
    LengthPenalty,
    ModerationParameters,
    ModerationStigma,
    TextGenerationParameters,
    TextGenerationReturnOptions,
)

try:
    from tqdm.auto import tqdm
except ImportError:
    print("Please install tqdm to run this example.")
    raise

# Populate the environment first; the client credentials are read from it.
load_dotenv()
client = Client(credentials=Credentials.from_env())

# Attach a DEBUG-level file handler ('bampy.log') to the "bampy" logger.
fh = logging.FileHandler('bampy.log')
fh.setLevel(logging.DEBUG)
_bampy_logger = logging.getLogger("bampy")
_bampy_logger.setLevel(logging.DEBUG)
_bampy_logger.addHandler(fh)

# Generation settings passed with every judge request in this file:
# greedy decoding, between 1 and 250 new tokens, and the input text echoed
# back with each result (it is stored next to the judge's answer).
parameters = TextGenerationParameters(
    max_new_tokens=250,
    min_new_tokens=1,
    decoding_method=DecodingMethod.GREEDY,
    # length_penalty=LengthPenalty(start_index=5, decay_factor=1.5),
    return_options=TextGenerationReturnOptions(
        # if ordered is False, you can use return_options to retrieve the corresponding prompt
        input_text=True,
    ),
)

# Today's date in the America/Los_Angeles timezone, formatted like
# "March 08, 2024"; it is interpolated into some of the demo responses.
_los_angeles_tz = pytz.timezone("America/Los_Angeles")
current_date = datetime.datetime.now(_los_angeles_tz).strftime("%B %d, %Y")

# Template for the item being judged: it stops right after "comment: " so
# that the judge model continues with its own comment and evaluation.
evaluation_template = "\n".join(
    [
        "",
        "correct answer(s): {correct_answers}",
        "response: {response}",
        "Output: comment: ",
    ]
)
# Template for the worked demonstrations: same layout as above, followed by
# the reference comment and the reference evaluation label.
demo_evaluation_template = (
    evaluation_template + "{comment}" + "\nevaluation: {evaluation}"
)



# Relaxed evaluation of the model answers for prompt 1 and prompt 2.
# The gold answer is based on only one context (context 1 or context 2),
# so the model should provide a single correct answer for the given context.
# This instruction text opens every judge prompt built for those two prompts.

prefix_single = (
    "Please evaluate the response to a question under relaxed evaluation, where"
    " hallucinations, outdated information, and ill-formed answers are allowed,"
    " as long as the primary answer is accurate. Please credit the response"
    " only if it provides a confident and definitive answer, or the correct"
    " answer can be obviously inferred from the response. The primary or final"
    " answer when standing alone must be accurate. Any additional information"
    " that is provided must not contradict the primary answer or reshape one's"
    " perception of it. For answers that"
    " involve names of entities (e.g., people), complete names or commonly"
    " recognized names are expected. Regarding numerical answers, approximate"
    " numbers are generally not accepted unless explicitly included in the"
    " ground-truth answers. We accept ill-formed responses (including those in"
    " a non-English language), as well as hallucinated or outdated information"
    " that does not significantly impact the primary answer."
)

# Few-shot demonstrations for the single-answer judge prompt.
# Each entry holds a question, the accepted gold answers, a model response,
# the reference judge comment and the reference label ("correct"/"incorrect").
# All of this text is sent verbatim to the judge model, so wording changes
# alter the prompt.
demo_examples_single = [
    {
        "question": "How old is the world's oldest verified living person?",
        "correct_answers": [
            "116 years old",
            "116",
            "nearly 117 years old",
        ],
        "response": (
            f"As of today {current_date}, the most up-to-date and relevant"
            " information regarding this query is as follows. The oldest"
            " verified living person is Maria Branyas Morera, who was born"
            " on March 4, 1907, making her 116 years old."
        ),
        "comment": (
            "This is a valid question. The primary answer in the response"
            " (116 years old) is accurate, and all the provided information"
            " is accurate and up-to-date. Thus, the response is credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": (
            "Which language has the third largest population by number of"
            " native speakers?"
        ),
        "correct_answers": ["English"],
        "response": "1. Mandarin 2. Spanish 3. English",
        "comment": (
            "This is a valid question. Even though the response doesn't contain"
            " an explicit statement, the correct answer (English) can still be"
            " inferred from the response. Thus, the response is credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": (
            "Has Virginia Woolf's novel about the Ramsay family entered the"
            " public domain in the United States?"
        ),
        "correct_answers": ["Yes"],
        "response": (
            "1927's To the Lighthouse is in the public domain in the United"
            " States. The final answer is To the Lighthouse."
        ),
        "comment": (
            "This is a valid question. While the primary information in the"
            " response (1927's To the Lighthouse is in the public domain in"
            " the United States) is accurate, the final answer when standing"
            ' alone is inaccurate (it should be "Yes" instead of "To the'
            ' Lighthouse"). Thus, the response is not credited.'
        ),
        "evaluation": "incorrect",
    },
    {
        "question": "Is Tesla's stock price above $250 a share?",
        "correct_answers": ["No"],
        "response": "No, it isn't. The stock price is currently at $257.",
        "comment": (
            "This is a valid question. While the primary answer in the"
            " response (No) is accurate, the additional information"
            " contradicts the primary answer (250). Thus,"
            " the response is not credited."
        ),
        "evaluation": "incorrect",
    },
    {
        "question": "Who founded Amazon?",
        "correct_answers": ["Jeff Bezos", "Jeffrey Preston Bezos"],
        "response": "100% of people said Jeff.",
        "comment": (
            "This is a valid question. Given that the question involves a"
            " person's name, a complete name or commonly recognized name is"
            ' expected ("Jeff Bezos" instead of "Jeff"). Thus, the response'
            " is not credited."
        ),
        "evaluation": "incorrect",
    },
    {
        "question": "How many books has Colleen Hoover published?",
        "correct_answers": ["26 books", "26"],
        "response": (
            f"As of today {current_date}, the most up-to-date and relevant"
            " information regarding this query is as follows. Colleen Hoover"
            " has published over 20 books and novellas."
        ),
        "comment": (
            "This is a valid question. For a numerical answer, an approximate"
            " value (over 20 books) is generally not accepted unless"
            " explicitly included in the correct answers. Thus, the response"
            " is not credited."
        ),
        "evaluation": "incorrect",
    },
    {
        "question": "What is the largest model size of GPT-3?",
        "correct_answers": [
            "175 billion parameters",
            "175B parameters",
            "175 billion",
            "175B",
        ],
        "response": (
            "175 billion parameters. It is a 175 billion parameter, 1.37TB,"
            " 137B token, 137B wordpiece, 137B BPE, 137B byte pair encoding,"
            " 137B BPE token, 137B BPE wordpiece, 137B BPE token, 137B BPE"
            " wordpiece"
        ),
        "comment": (
            "This is a valid question. Even though the response is ill-formed"
            " and contains repetitions, the primary answer in the response (175"
            " billion parameters) is still accurate. Thus, the response is"
            " credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": "How many member nations exist in the United Nations?",
        "correct_answers": [
            "193",
            "193 members",
            "193 member states",
            "193 sovereign states",
        ],
        "response": (
            "As of my last update in April 2023, there were 193"
            " member nations in the United Nations. Please note that this"
            " information may have changed since then, so it is advisable to"
            " check the latest information from the United Nations or other"
            " reliable sources."
        ),
        "comment": (
            "This is a valid question. Even though the response fails to"
            " provide a current answer and instead relies on outdated "
            " information without confirming its continued accuracy, the"
            " primary answer in the response (193 member nations) is still"
            " accurate. Thus, the response is credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": (
            "Who won the biggest single-tournament payday in tennis history?"
        ),
        "correct_answers": ["Novak Djokovic"],
        "response": (
            "The biggest single-tournament payday in tennis history was won"
            " by Novak Djokovic at the 2021 Australian Open. He received a"
            " prize money of AUD 2.75 million (approximately USD 2.1 million)"
            " for winning the men's singles title."
        ),
        "comment": (
            "This is a valid question. Even though the additional provided"
            " information is outdated or hallucinated (by winning the 2022 ATP"
            " Finals title, Djokovic earned a record $4,740,300, the highest"
            " payout for a tournament winner in tennis), the primary answer in"
            " the response (Novak Djokovic) is still accurate. Thus, the"
            " response is credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": (
            "Who was the first football player to score one hundred"
            " international goals?"
        ),
        "correct_answers": ["Ali Daei"],
        "response": (
            "As of my last update in April 2023, Ali Daei, a"
            " former Iranian footballer, was the first player to score 100"
            " international goals."
        ),
        # NOTE(review): the comment below refers to a 2021 knowledge cutoff
        # while the response above says April 2023, and it contains an
        # unmatched ")". Left unchanged because the text is part of the judge
        # prompt -- confirm whether it should be aligned.
        "comment": (
            "This is a valid question. The primary answer in the response"
            " (Ali Daei) is accurate. Although the knowledge cutoff date is"
            " in 2021, this fact is timeless because Ali Daei was the initial"
            " achiever of this milestone, and he will always remain the first)."
            " Thus, the response is credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": (
            "Who is the latest winner of the Formula 1 world championship?"
        ),
        "correct_answers": ["Max Verstappen", "Max Emilian Verstappen"],
        "response": (
            "Max Verstappen won the 2023 Formula 1 world championship. He is"
            " a Dutch racing driver who competes in Formula One for Red Bull"
            " Racing. He is the son of former Formula One driver Jos"
            " Verstappen. Verstappen started his racing career in karting at"
            " the age of seven. He won the 2013 European Karting Championship"
            " and the 2013 World Karting Championship. In 2014, he moved to"
            " single-seater racing, competing in the European Formula Three"
            " Championship. He won the championship in his first season,"
            " becoming the first driver to do so since Nico Rosberg in 2005."
            " In 2015, Verstappen moved to Formula One, driving for Toro"
            " Rosso. He became the youngest driver to compete in Formula One"
            " at the age of 17. He scored his first points in Formula One at"
            " the 2015 Hungarian GrandPrix. In 2016, Verstappen moved to Red"
            " Bull Racing. He won his first race at the 2016 Spanish Grand"
            " Prix. He became the youngest driver to win a race in Formula"
            " One at theage of 18. Verstappen finished the 2016 season in"
            " third place in the drivers' championship. In 2017, Verstappen"
            " won four races and finished the season in second place in the"
            " drivers' championship. In 2018, Verstappen won seven races and"
            " finished the season in second place in the drivers'"
            " championship. In 2019, Verstappen won nine races and finished"
            " the season in first place in the drivers' championship. He is"
            " the first Dutch driver to win the Formula One world"
            " championship."
        ),
        "comment": (
            "This is a valid question. Even though the response contains"
            " several instances of hallucinated information (e.g., Max"
            " Verstappen did not win the Formula Three European Championship in"
            " 2014), the primary answer in the response (Max Verstappen) is"
            " still accurate. Thus, the response is credited."
        ),
        "evaluation": "correct",
    },
]


# Relaxed evaluation of the model answers for prompt 0, prompt 3, and prompt 4.
# The gold answers come from the two contradictory passages (context 1 and
# context 2), so the model should provide multiple correct answers based on
# the different sources.
# This instruction text opens every judge prompt built for those three prompts.
prefix_contradict = (
    "Please evaluate the response to a question under relaxed evaluation, where"
    " hallucinations, outdated information, and ill-formed answers are allowed,"
    " as long as the primary answer is accurate. Note that for each question, there"
    " are multiple correct answers based on different sources even though these correct answers"
    " contradict each other."
    " Please credit the response"
    " only if it provides a list of confident and definitive answers that match with the gold answer list, or the correct"
    " answers can be obviously inferred from the response. The primary or final"
    " answers when standing alone must be accurate. Any additional information"
    " that is provided must not contradict the primary answers or reshape one's"
    " perception of them. For answers that"
    " involve names of entities (e.g., people), complete names or commonly"
    " recognized names are expected. Regarding numerical answers, approximate"
    " numbers are generally not accepted unless explicitly included in the"
    " ground-truth answers. We accept ill-formed responses (including those in"
    " a non-English language), as well as hallucinated or outdated information"
    " that does not significantly impact the primary answers."
)

# Few-shot demonstrations for the multi-answer (contradicting sources) judge
# prompt. Same fields as the single-answer demos, but the label may also be
# "partially correct" when only some of the gold answers are matched.
# All of this text is sent verbatim to the judge model.
demo_examples_contradict = [
    {
        "question": "How old is the world's oldest verified living person?",
        "correct_answers": [
            "116 years old",
            "112 years old",
        ],
        "response": (
            f"As of today {current_date}, the most up-to-date and relevant"
            " information regarding this query is as follows. The oldest"
            " verified living person is Maria Branyas Morera, who was born"
            " on March 4, 1907, making her 116 years old."
        ),
        "comment": (
            "This is a valid question. The primary answer in the response"
            " (116 years old) is partially accurate as it matched with one of the correct answers (116 years old, 112 years old)."
            " Thus, the response is partially credited."
        ),
        "evaluation": "partially correct",
    },
    {
        "question": "Is Tesla's stock price above $250 a share on March 8, 2020?",
        "correct_answers": ["No", "Yes"],
        # The two adjacent string literals below are concatenated into one response.
        "response": "No, it isn't. The stock price is at $257 on March 8, 2020."
        " But according to some sources, it is at $247 on March 8, 2020. So, yes it is above $250.",
        "comment": (
            "This is a valid question. While the primary answers in the"
            " response (No and Yes) are accurately matched with the correct answers, the additional information"
            " contradicts the primary answers as $257 is above $250 and $247 is below $250. Thus,"
            " the response is not credited."
        ),
        "evaluation": "incorrect",
    },
    {
        "question": "How many books has Colleen Hoover published in 2020?",
        "correct_answers": ["26 books", "27"],
        "response": (
            "according to some sources, Colleen Hoover"
            " has published 26 books in 2020. However, other sources claim that it has published 27 books in 2020."
        ),
        "comment": (
            "This is a valid question. The response accurately provides two answers (26 books and 27 books) based"
            " on different sources. These two answers match with all correct answers (26 books, 27). Thus, the response"
            " is credited."
        ),
        "evaluation": "correct",
    },
    {
        "question": "How many member nations exist in the United Nations?",
        "correct_answers": [
            "192 members",
            "191 member states",
        ],
        "response": (
            "As of my last update in April 2023, there were 193"
            " member nations in the United Nations. Please note that this"
            " information may have changed since then, so it is advisable to"
            " check the latest information from the United Nations or other"
            " reliable sources."
        ),
        "comment": (
            "This is a valid question. The"
            " primary answer in the response (193 member nations) does not match with any of the correct"
            " answers (191 member states or 192 members). Thus, the response is not credited."
        ),
        "evaluation": "incorrect",
    },
    {
        "question": (
            "Who won the biggest single-tournament payday in tennis history?"
        ),
        "correct_answers": ["Novak Djokovic", "Carlos Alcaraz"],
        "response": (
            "The biggest single-tournament payday in tennis history was won"
            " by Novak Djokovic at the 2021 Australian Open. He received a"
            " prize money of AUD 2.75 million (approximately USD 2.1 million)"
            " for winning the men's singles title."
        ),
        "comment": (
            "This is a valid question. Even though the additional provided"
            " information is outdated or hallucinated (by winning the 2022 ATP"
            " Finals title, Djokovic earned a record $4,740,300, the highest"
            " payout for a tournament winner in tennis), the primary answer in"
            " the response (Novak Djokovic) is still accurate and matches with one of correct answers."
            " But the response does not mention another correct answer (Carlos Alcaraz), thus, the"
            " response is partially credited."
        ),
        "evaluation": "partially correct",
    },
]

# Pre-render the few-shot demonstrations used in the judge prompts.
def _render_demo_evaluations(examples):
    """Fill the demo template once per example and return the rendered strings.

    ``question`` is passed along as in the per-example fields, but the template
    has no ``{question}`` placeholder, so ``str.format`` simply ignores it.
    """
    return [
        demo_evaluation_template.format(
            question=example["question"],
            correct_answers=' | '.join(example["correct_answers"]),
            response=example["response"],
            comment=example["comment"],
            evaluation=example["evaluation"],
        )
        for example in examples
    ]


# Single-answer demonstrations: questions and their rendered evaluations.
demo_questions_single = [example["question"] for example in demo_examples_single]
demo_evaluations_single = _render_demo_evaluations(demo_examples_single)

# Multi-answer (contradicting sources) demonstrations.
demo_questions_contradict = [example["question"] for example in demo_examples_contradict]
demo_evaluations_contradict = _render_demo_evaluations(demo_examples_contradict)


def load_testingdata(
    modelName,
    data_dir='/Users/yhou/git/wikiEvidenceVeracity/data/annotation/promptExp/LLM_Answers_Annotation',
):
    """Load the annotated answers of one tested model from its JSON file.

    Args:
        modelName: Short model name used in the file name
            ``LLM_Answers_Model_<modelName>.json``.
        data_dir: Directory that contains the file. Defaults to the original
            hard-coded annotation folder, so existing callers are unaffected.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    import os  # local import: only this function needs it

    path = os.path.join(data_dir, 'LLM_Answers_Model_' + modelName + '.json')
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)



def evaluateLLMAnswers_promptJudgeLLMs():
    """Ask every judge model to grade every tested model's answers and save the output.

    For each tested model the answers file is loaded and one judge prompt per
    unit is built for each of the five prompt variants. Prompt 1 and prompt 2
    are judged against a single gold answer with the single-answer
    instructions; prompt 0, 3 and 4 are judged against both gold answers with
    the contradicting-sources instructions. Each judge model is then queried
    through ``client``; the generated text and the echoed input text are stored
    on the unit under ``JudgeModelAnswer_...`` / ``JudgeModelInput_...`` keys,
    and the enriched units are written to the ``LLM_Answers_LLMJudge`` folder.

    Returns:
        None. Results are written to disk.
    """
    # TestModels = ['mistralai/mixtral-8x7b-instruct-v0-1','mistralai/mistral-7b-instruct-v0-2', 'google/flan-ul2', 'meta-llama/llama-2-70b-chat', 'ibm/granite-13b-chat-v2','ibm/granite-13b-instruct-v2', 'ibm/granite-13b-lab-incubation', 'meta-llama/llama-2-7b-chat', 'meta-llama/llama-2-13b-chat']
    TestModels = ['mistralai/mixtral-8x7b-instruct-v0-1','mistralai/mistral-7b-instruct-v0-2', 'google/flan-ul2', 'meta-llama/llama-2-70b-chat', 'ibm/granite-13b-chat-v2','ibm/granite-13b-instruct-v2', 'ibm/granite-13b-lab-incubation', 'meta-llama/llama-2-7b-chat', 'meta-llama/llama-2-13b-chat']
    judgeModels = ['meta-llama/llama-2-70b-chat', 'mistralai/mixtral-8x7b-instruct-v0-1', 'mistralai/mistral-7b-instruct-v0-2']

    prompt_names = ['prompt_0', 'prompt_1', 'prompt_2', 'prompt_3', 'prompt_4']
    # Prompts judged against exactly one gold answer, mapped to the unit key
    # holding that answer; every other prompt is judged against both answers.
    single_answer_key = {
        'prompt_1': 'question1_answer1',
        'prompt_2': 'question1_answer2',
    }

    # The instruction + few-shot part of a judge prompt is identical for every
    # unit, so it is built once here instead of once per unit.
    wikieval_demo_contradict = ''.join(
        f'\n\n\nInput: question: {q}{e}'
        for q, e in zip(demo_questions_contradict, demo_evaluations_contradict)
    ).strip()
    wikieval_demo_single = ''.join(
        f'\n\n\nInput: question: {q}{e}'
        for q, e in zip(demo_questions_single, demo_evaluations_single)
    ).strip()
    # NOTE(review): the opening marker is '[s][INST]'; it looks like it may
    # have been meant as '<s>[INST]', but it is kept as-is so prompts stay
    # identical to earlier runs -- confirm.
    header_contradict = '[s][INST]' + prefix_contradict + '\n\n\n' + wikieval_demo_contradict
    header_single = '[s][INST]' + prefix_single + '\n\n\n' + wikieval_demo_single

    for model in TestModels:
        model_name = str(model).split('/')[1]
        testingModelResultsFile = '/Users/yhou/git/wikiEvidenceVeracity/data/annotation/promptExp/LLM_Answers/LLM_Answers_Model_' + model_name +'.json'
        # Load and close the file right away; the judge calls below are long-running.
        with open(testingModelResultsFile, encoding='utf-8') as f:
            modelResults = json.load(f)

        # prompt name -> list of judge prompts, aligned by index with modelResults
        prompt_all = {prompt_name: [] for prompt_name in prompt_names}
        for unit in modelResults:
            question = unit['question1']
            both_answers = [unit['question1_answer1'], unit['question1_answer2']]
            for prompt_name in prompt_names:
                if prompt_name in single_answer_key:
                    gold_answers = [unit[single_answer_key[prompt_name]]]
                    header = header_single
                else:
                    gold_answers = both_answers
                    header = header_contradict
                evaluation = evaluation_template.format(
                    correct_answers=' | '.join(gold_answers),
                    # flatten the model answer to one line for the prompt
                    response=unit['ModelAnswer_' + prompt_name].replace('\n', ' ').strip(),
                )
                eval_question = f'\n\n\nInput: question: {question}{evaluation}'
                prompt_all[prompt_name].append(header + eval_question + '[/INST]')

        for judgeModel in judgeModels:
            judgemodel_name = str(judgeModel).split('/')[1]
            for prompt_name, prompts in prompt_all.items():
                print(f"Using {judgemodel_name} to evaluate the answers of the model {model_name} with prompt {prompt_name}")
                key_suffix = "_" + prompt_name + "_" + model_name + "_JudgeModelName_"+judgemodel_name
                # ordered=True is relied upon: each response is written back to
                # modelResults by its position in the stream.
                responses = client.text.generation.create(
                    model_id=judgeModel,
                    inputs=prompts,
                    execution_options=CreateExecutionOptions(ordered=True),
                    parameters=parameters,
                )
                for idx, response in tqdm(
                    enumerate(responses),
                    total=len(prompts),
                    desc="Progress",
                    unit="input",
                ):
                    result = response.results[0]
                    modelResults[idx]["JudgeModelAnswer" + key_suffix] = result.generated_text
                    modelResults[idx]["JudgeModelInput" + key_suffix] = result.input_text

        # saving the judge model prompting results to a json file
        testingModel_JudgeModelResultFile = '/Users/yhou/git/wikiEvidenceVeracity/data/annotation/promptExp/LLM_Answers_LLMJudge/LLM_Answers_Model_' + model_name + '_Eval_LLMJudge'+'.json'
        with open(testingModel_JudgeModelResultFile, 'w', encoding='utf-8') as f:
            json.dump(modelResults, f, indent=4)
    
def extractJudgeRating_ReportLLMJudgeResults():
    """Parse the saved judge outputs into ratings and print per-prompt tallies.

    For every tested model, judge model and prompt variant, the stored
    ``JudgeModelAnswer_...`` text of each unit is parsed into a rating
    ('TRUE', 'PARTIALLY TRUE', 'FALSE' or None when no verdict can be
    extracted). Prompt 0, 3 and 4 use the multi-answer parser; prompt 1 and 2
    use the single-answer parser. The counts are printed per combination and
    the full nested report is printed at the end.

    Returns:
        None. The report is printed.
    """
    # TestModels = ['mistralai/mixtral-8x7b-instruct-v0-1','mistralai/mistral-7b-instruct-v0-2', 'google/flan-ul2', 'meta-llama/llama-2-70b-chat', 'ibm/granite-13b-chat-v2','ibm/granite-13b-instruct-v2', 'ibm/granite-13b-lab-incubation', 'meta-llama/llama-2-7b-chat', 'meta-llama/llama-2-13b-chat']
    TestModels = ['mistralai/mixtral-8x7b-instruct-v0-1','mistralai/mistral-7b-instruct-v0-2', 'google/flan-ul2']
    judgeModels = ['meta-llama/llama-2-70b-chat', 'mistralai/mixtral-8x7b-instruct-v0-1', 'mistralai/mistral-7b-instruct-v0-2']
    multi_answer_prompts = ('prompt_0', 'prompt_3', 'prompt_4')

    AllResults_report = {}

    for model in TestModels:
        model_name = str(model).split('/')[1]
        testingModel_JudgeModelResultFile = '/Users/yhou/git/wikiEvidenceVeracity/data/annotation/promptExp/LLM_Answers_LLMJudge/LLM_Answers_Model_' + model_name + '_Eval_LLMJudge'+'.json'
        with open(testingModel_JudgeModelResultFile, encoding='utf-8') as f:
            modelResults = json.load(f)

        modelResults_report = {}
        for judgeModel in judgeModels:
            judgemodel_name = str(judgeModel).split('/')[1]
            modelResults_report[judgemodel_name] = {}
            for prompt_name in ['prompt_0', 'prompt_1', 'prompt_2', 'prompt_3', 'prompt_4']:
                if prompt_name in multi_answer_prompts:
                    extract_rating = extract_ratings_contradict
                else:
                    extract_rating = extract_ratings_single
                key_suffix = "_" + prompt_name + "_" + model_name + "_JudgeModelName_"+judgemodel_name

                true_count = 0
                partial_correct_count = 0
                false_count = 0
                None_count = 0
                for unit in modelResults:
                    is_valid_eval, parsed = extract_rating(unit["JudgeModelAnswer" + key_suffix])
                    judgeModelRating = parsed['rating'] if is_valid_eval else None
                    # NOTE: the rating is stored on the in-memory unit only;
                    # this function does not write the units back to disk.
                    unit["JudgeModelRating" + key_suffix] = judgeModelRating

                    if judgeModelRating == 'TRUE':
                        true_count += 1
                    elif judgeModelRating == 'FALSE':
                        false_count += 1
                    elif judgeModelRating == 'PARTIALLY TRUE':
                        partial_correct_count += 1
                    else:
                        None_count += 1

                modelResults_report[judgemodel_name][prompt_name] = [true_count, partial_correct_count, false_count, None_count]
                print(f"Model {model_name} with prompt template {prompt_name} is rated as True {true_count} times, partially-true {partial_correct_count} times, False {false_count} times, and None {None_count} times by the judge model {judgemodel_name}")
        AllResults_report[model_name] = modelResults_report
    print(AllResults_report)

def extract_ratings_single(response):
    """Extract the judge's verdict from a single-answer evaluation.

    The response is lower-cased and scanned line by line for an
    ``evaluation: <label>`` marker. When several lines carry the marker, the
    last one wins; an unrecognised label makes the whole response invalid.
    If no line carries the marker, the closing sentence of the judge comment
    ("thus, the response is (not) credited.") is used instead.

    Args:
        response: Raw text generated by the judge model.

    Returns:
        ``(True, {'rating': 'TRUE' | 'FALSE'})`` when a verdict is found,
        otherwise ``(False, {'rating': None})``.
    """
    label_to_rating = {
        'correct': 'TRUE',
        'correct.': 'TRUE',
        'incorrect': 'FALSE',
        'incorrect.': 'FALSE',
    }
    marker = 'evaluation: '

    response = response.lower()
    evaluation = None
    for line in response.split('\n'):
        # Only lines that carry the marker are inspected; previously the
        # validity check ran on every line, so any response whose first line
        # lacked the marker was rejected outright.
        if marker not in line:
            continue
        label = line.split(marker)[-1].strip()
        if label not in label_to_rating:
            return False, {'rating': None}
        evaluation = label_to_rating[label]

    if evaluation is None:
        # The text is lower-cased above, so the phrases must be lower-case too.
        if 'thus, the response is credited.' in response:
            evaluation = 'TRUE'
        elif 'thus, the response is not credited.' in response:
            evaluation = 'FALSE'
        else:
            return False, {'rating': None}
    return True, {'rating': evaluation}


def extract_ratings_contradict(response):
    """Extract the judge's verdict from a multi-answer evaluation.

    The lower-cased response is scanned for ``evaluation: <label>`` lines; the
    last such line decides the rating, and an unknown label invalidates the
    response. Without any such line, the closing sentence of the judge comment
    is used as a fallback.

    Returns:
        ``(True, {'rating': 'TRUE' | 'FALSE' | 'PARTIALLY TRUE'})`` when a
        verdict is found, otherwise ``(False, {'rating': None})``.
    """
    label_to_rating = {
        'correct': 'TRUE',
        'correct.': 'TRUE',
        'incorrect': 'FALSE',
        'incorrect.': 'FALSE',
        'partially correct': 'PARTIALLY TRUE',
        'partially correct.': 'PARTIALLY TRUE',
    }
    # Fallback sentences, checked in this order when no label line exists.
    fallback_phrases = (
        ('the response is credited.', 'TRUE'),
        ('the response is not credited.', 'FALSE'),
        ('the response is partially credited.', 'PARTIALLY TRUE'),
    )
    marker = 'evaluation: '

    text = response.lower()
    rating = None
    for row in text.split('\n'):
        if marker not in row:
            continue
        label = row.split(marker)[-1].strip()
        if label not in label_to_rating:
            return False, {'rating': None}
        rating = label_to_rating[label]

    if rating is None:
        for phrase, verdict in fallback_phrases:
            if phrase in text:
                rating = verdict
                break
        else:
            return False, {'rating': None}
    return True, {'rating': rating}
                

    
if __name__ == "__main__":
    # Two stages: the judging stage (commented out below) queries the judge
    # models and writes the result files; the reporting stage reads those
    # files, so it only works after the judging stage has been run.
    # evaluateLLMAnswers_promptJudgeLLMs()
    extractJudgeRating_ReportLLMJudgeResults()
    
