import json
import os
import re
import random

# Translation table from the tag written inside "{...}" in an annotated
# example to the parameter name emitted in the generated DSL call.
# "REG" is the only tag whose parameter name is not simply its lowercase
# form, so it is spelled out and the rest are derived.
label_mapping = {"REG": "reagent"}
label_mapping.update(
    (tag, tag.lower())
    for tag in (
        "Container",
        "Device",
        "Time",
        "Temperature",
        "Mass",
        "Speed",
        "Concentration",
        "Volume",
        "Length",
        "String",
        "Force",
        "Bool",
        "Voltage",
        "Frequency",
    )
)

# Mapped parameter names encountered so far; filled in by format() and
# read by the script section to extend the grammar.
label_lists = set()

def format(annotated_corpora, action):
    """Turn one annotated example into a (plain sentence, DSL call) pair.

    The input marks parameter values as ``[text]`` and their types as
    ``{Label}``. The i-th bracketed span is paired with the i-th braced
    label, in order of appearance.

    Args:
        annotated_corpora: Annotated example sentence (a string).
        action: Action name; it is lower-cased in the generated call.

    Returns:
        ``(sentence, call)`` where ``sentence`` is the input with the square
        brackets removed and the ``{Label}`` markers deleted, and ``call``
        looks like ``(action="...",name="value",...,output="");``.
        ``(None, None)`` when no span/label pair yields a known parameter.

    Side effects:
        Every known label found is recorded (by its mapped name) in the
        module-level ``label_lists`` set.
    """
    origin = re.findall(r'\[([^[\]]*)\]', annotated_corpora)
    label = re.findall(r'\{([^}]*)\}', annotated_corpora)

    # Unwrap "[text]" -> "text".
    for ori in origin:
        annotated_corpora = annotated_corpora.replace("[" + ori + "]", ori)

    # Drop every "{Label}" marker from the sentence. Labels missing from
    # label_mapping are still stripped but are not recorded, consistent
    # with how they are skipped when the parameters are built below.
    for lab in label:
        if lab in label_mapping:
            label_lists.add(label_mapping[lab])
        annotated_corpora = annotated_corpora.replace("{" + lab + "}", "")

    params = []
    for ori, lab in zip(origin, label):
        if lab not in label_mapping:
            continue
        # Quotes are removed so the value cannot terminate the surrounding
        # double-quoted string; whitespace runs collapse to one space.
        value = re.sub(r'\s+', ' ', ori.replace("\"", "").replace("'", ""))
        params.append(f'{label_mapping[lab]}="{value}"')

    if not params:
        return None, None

    paramter = ",".join(params)
    return annotated_corpora, f'(action="{action.lower()}",{paramter},output="");'

# Lark grammar template. The "*-*-*-" line is a placeholder that the script
# section replaces with one `| "<label>=" object` alternative per extra
# parameter name. Kept as a raw string so the regex `\s` is not treated as
# an (invalid) Python string escape.
grammar_origin = r"""%ignore /\s+/
%import common.ESCAPED_STRING

program : sentence
    | sentence program
sentence: "(action=" action_name "," variable_list ");"
variable_list : variable
    | variable_list "," variable
variable : "reagent=" object
    | "output=" object
*-*-*-
action_name: ESCAPED_STRING
object : ESCAPED_STRING"""

if __name__ == "__main__":
    # Category name -> sub-directory of data/ (also the grammar file stem).
    file_paths = {
        "Molecular Biology & Genetics": "Genetics",
        "Biomedical & Clinical Research": "Medical",
        "Ecology & Environmental Biology": "Ecology",
        "Bioengineering & Technology": "BioEng"
    }
    # Category name -> tagged DSL JSON file inside data/dsl_tag_result/.
    # NOTE(review): "environmental_environmental" in the Ecology entry looks
    # duplicated -- confirm it matches the actual file name on disk.
    dsl_paths = {
        "Molecular Biology & Genetics": "molecular_biology_and_genetics_dsl.json",
        "Biomedical & Clinical Research": "biomedical_and_clinical_research_dsl.json",
        "Ecology & Environmental Biology": "ecology_and_environmental_environmental_dsl.json",
        "Bioengineering & Technology": "bioengineering_and_technology_dsl.json"
    }

    # Number of examples concatenated into one line of train.src/train.tgt.
    group_size = 5

    for category, dir_name in file_paths.items():
        train_src = "data/" + dir_name + "/train.src"
        train_tgt = "data/" + dir_name + "/train.tgt"
        grammar_path = "grammars/" + dir_name + ".lark"
        dsl_result = "data/dsl_tag_result/" + dsl_paths[category]

        # Outputs are only written after the input has been parsed, so a
        # missing or malformed input file does not clobber existing outputs.
        with open(dsl_result, 'r', encoding="utf-8") as f:
            data = json.load(f)

        store = []
        # Start each category with an empty label set; format() adds to it.
        label_lists.clear()
        for action, pattern_list in data.items():
            for pattern in pattern_list:
                for example in pattern["example"]:
                    s = example.replace("\n", "")
                    s = re.sub(r'\s+', ' ', s)
                    src, tgt = format(s, action)
                    if src is None:
                        # No usable parameter was found in this example.
                        continue
                    store.append({"a": src, "b": tgt})

        # "reagent" is already part of the template. Sorting keeps the
        # generated grammar identical from run to run (set order is not).
        extra_rules = "\n".join(
            f'    | "{label}=" object'
            for label in sorted(label_lists)
            if label != "reagent"
        )
        new_grammar = grammar_origin.replace("*-*-*-", extra_rules)
        with open(grammar_path, 'w', encoding="utf-8") as file:
            file.write(new_grammar)

        random.shuffle(store)
        # Each output line is the concatenation (no separator) of up to
        # group_size shuffled examples; line i of train.src corresponds to
        # line i of train.tgt.
        with open(train_src, 'w', encoding="utf-8") as src_file:
            with open(train_tgt, 'w', encoding="utf-8") as tgt_file:
                for start in range(0, len(store), group_size):
                    group = store[start:start + group_size]
                    src_file.write("".join(item["a"] for item in group) + "\n")
                    tgt_file.write("".join(item["b"] for item in group) + "\n")
