import json
import os
import re
import random

from tqdm import tqdm

# Maps the annotation tags that appear in the corpus as ``{Tag}`` to the
# parameter names emitted in the generated DSL call.
label_mapping = {
    "REG": "reagent",
    "Container": "container",
    "Device": "device",
    "Time": "time",
    "Temperature": "temperature",
    "Mass": "mass",
    "Speed": "speed",
    "Concentration": "concentration",
    "Volume": "volume",
    "Length": "length",
    "String": "string",
    "Force": "force",
    "Bool": "bool",
    "Voltage": "voltage",
    "Frequency": "frequency",
}

# Parameter names encountered so far by ``format``. This is a module global
# on purpose: ``format`` adds to whatever set this name currently refers to.
label_lists = set()


def format(annotated_corpora, action):
    """Turn one annotated example into a (plain text, DSL call) pair.

    The input marks spans as ``[text]`` and tags as ``{Label}``. Spans and
    tags are paired by position (first ``[...]`` with first ``{...}``, and so
    on), so the annotation is assumed to contain them in matching order.

    Args:
        annotated_corpora: The annotated sentence.
        action: Action name; it is lower-cased in the output.

    Returns:
        ``(text, call)`` where ``text`` is the sentence with the ``[...]``
        brackets removed and every ``{Label}`` marker deleted, and ``call`` is
        ``(action="<action>",<name>="<value>",...);``. Returns
        ``(None, None)`` when no span carries a label known to
        ``label_mapping``.

    Side effects:
        Every recognised parameter name is added to the module-level
        ``label_lists`` set.
    """
    origin = re.findall(r'\[([^[\]]*)\]', annotated_corpora)
    label = re.findall(r'\{([^}]*)\}', annotated_corpora)

    # Drop the square brackets but keep the annotated text itself.
    for ori in origin:
        annotated_corpora = annotated_corpora.replace("[" + ori + "]", ori)

    for lab in label:
        # Unknown tags are skipped here exactly as they are skipped when the
        # parameter list is built below; previously this lookup raised
        # KeyError for any tag missing from ``label_mapping``.
        if lab in label_mapping:
            label_lists.add(label_mapping[lab])
        annotated_corpora = annotated_corpora.replace("{" + lab + "}", "")

    pairs = []
    for ori, lab in zip(origin, label):
        name = label_mapping.get(lab)
        if name is None:
            continue
        # Quotes would break the quoted DSL value; whitespace runs are
        # collapsed to a single space.
        value = re.sub(r'\s+', ' ', ori.replace('"', '').replace("'", ''))
        pairs.append(f'{name}="{value}"')

    if not pairs:
        return None, None

    parameters = ",".join(pairs)
    return annotated_corpora, f'(action="{action.lower()}",{parameters});'

# Lark grammar template for the generated DSL. The ``*-*-*-`` line is a
# placeholder that gets replaced with additional ``variable`` alternatives.
# Declared as a raw string so that ``\s`` reaches the grammar verbatim instead
# of being an invalid Python escape sequence.
grammar_origin = r"""%ignore /\s+/
%import common.ESCAPED_STRING

program : sentence
    | sentence program
sentence: "(action=" action_name "," variable_list ");"
variable_list : variable
    | variable_list "," variable
variable : "reagent=" object
*-*-*-
action_name: ESCAPED_STRING
object : ESCAPED_STRING"""

if __name__ == "__main__":
    # This stays a module-level script (not wrapped in a function) because
    # ``label_lists`` must remain a module global: ``format`` adds to it and
    # the per-category loop below rebinds it.
    folder_path = "data/original_protocol"

    # Procedure texts grouped by the "bigAreas" values found in each protocol.
    classification = {
        "Molecular Biology & Genetics": [],
        "Biomedical & Clinical Research": [],
        "Ecology & Environmental Biology": [],
        "Bioengineering & Technology": [],
        "Bioinformatics & Computational Biology": []
    }
    # Output path of the cleaned protocol list for each processed category.
    # NOTE(review): "Bioinformatics & Computational Biology" is collected above
    # but has no entry here or in ``dsl_paths`` -- presumably intentional.
    file_paths = {
        "Molecular Biology & Genetics": "data/protocol_list/Genetics.json",
        "Biomedical & Clinical Research": "data/protocol_list/Medical.json",
        "Ecology & Environmental Biology": "data/protocol_list/Ecology.json",
        "Bioengineering & Technology": "data/protocol_list/BioEng.json"
    }
    # Input DSL definition (action -> patterns -> examples) for each category.
    dsl_paths = {
        "Molecular Biology & Genetics": "data/autodsl/Genetics.json",
        "Biomedical & Clinical Research": "data/autodsl/Medical.json",
        "Ecology & Environmental Biology": "data/autodsl/Ecology.json",
        "Bioengineering & Technology": "data/autodsl/BioEng.json"
    }

    # Step 1: read every protocol file and file its procedures under each of
    # its areas.
    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        for area in data["bigAreas"]:
            if area not in classification:
                # Raising a plain string is itself a TypeError in Python 3;
                # raise a real exception that names the offending input.
                raise ValueError(
                    f"Area does not match: {area!r} in {file_path}"
                )
            classification[area].append(' '.join(data["procedures"]))

    # Step 2: per category, write the cleaned protocol texts and convert the
    # annotated DSL examples.
    for category, file_path in file_paths.items():
        subject_name = os.path.splitext(os.path.basename(file_path))[0]
        print(subject_name)

        grammar_path = "data/grammar/" + subject_name + ".lark"
        dsl_path = dsl_paths[category]

        data = classification[category]
        remake_data = []
        for s in tqdm(data):
            # Collapse runs of spaces/tabs, then drop anything that looks
            # like an HTML/XML tag.
            cleaned = re.sub(r'[ \t]+', ' ', s)
            cleaned = re.sub(r'<[^>]*>', '', cleaned)
            remake_data.append(cleaned)

        with open(file_path, "w", encoding='utf-8') as f:
            json.dump(remake_data, f, indent=2)

        # Read with an explicit encoding, matching how the protocol files are
        # read above, instead of relying on the platform default.
        with open(dsl_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        store = []
        # Fresh label set for this category; ``format`` fills it as a side
        # effect through the module global of the same name.
        label_lists = set()
        for action in data:
            for pattern in data[action]:
                for example in pattern["example"]:
                    s = example.replace("\n", "")
                    s = re.sub(r'\s+', ' ', s)
                    src, tgt = format(s, action)
                    if src is None:
                        continue
                    store.append({"a": src, "b": tgt})

        # "reagent" is already hard-coded in the template. Sorting makes the
        # generated grammar identical from run to run (set order is not).
        extra_rules = "\n".join(
            f'    | "{label}=" object'
            for label in sorted(label_lists)
            if label != "reagent"
        )
        new_grammar = grammar_origin.replace("*-*-*-", extra_rules)
        # Writing the grammar is currently disabled:
        # with open(grammar_path, 'w') as file:
        #     file.write(new_grammar)
