import glob
import os

import pandas as pd
from tableshift.core.features import FeatureList

from rtfm.data_sources.unipredict import format_target_column
from rtfm.task_config import TLMConfig


def clean_colname(colname: str) -> str:
    """Return ``colname`` with dots turned into underscores.

    After the dot substitution, each non-overlapping ``"__"`` pair is
    replaced by a single ``"_"``. This is a single left-to-right pass, so a
    run of three or more underscores is shortened but not necessarily
    reduced to one (e.g. ``"a...b"`` becomes ``"a__b"``).
    """
    return colname.replace(".", "_").replace("__", "_")


def generate_files_from_csv(
    task_input_dir: str, task_output_dir: str, task: str, to_regression: bool = True
) -> None:
    """Find the CSV file, read it, preprocess it, and write the results to output_dir.

    The last column of the CSV is treated as the prediction target. Three
    artifacts are written into ``task_output_dir`` (created if missing):
    ``feature_list.jsonl``, ``<task>.yaml`` and ``<task>.csv``.

    Args:
        task_input_dir: Directory expected to contain exactly one ``*.csv`` file.
        task_output_dir: Directory that receives the generated files.
        task: Task name, used as the stem of the YAML and CSV output files.
        to_regression: When true, the target column is passed through
            ``format_target_column`` before the feature list is built.

    Raises:
        ValueError: If ``task_input_dir`` does not contain exactly one CSV file.
    """
    fileglob = os.path.join(task_input_dir, "*.csv")
    csv_files = glob.glob(fileglob)

    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if len(csv_files) != 1:
        raise ValueError(
            f"expected one csv file matching {fileglob}, got {csv_files}"
        )

    csv_src = csv_files[0]
    df = pd.read_csv(csv_src)

    df.columns = [clean_colname(colname) for colname in df.columns]

    # The target is taken to be the last column of the file.
    target_colname = df.columns[-1]

    if to_regression:
        df[target_colname] = format_target_column(df[target_colname])

    fl = FeatureList.from_dataframe(df, target_colname)

    os.makedirs(task_output_dir, exist_ok=True)

    feature_list_jsonl = os.path.join(task_output_dir, "feature_list.jsonl")
    fl.to_jsonl(feature_list_jsonl)
    print(f"[INFO] FeatureList written to {feature_list_jsonl}")

    # Label values are collected after the optional target formatting above,
    # so they match what is written to the output CSV.
    task_config = TLMConfig(
        prefix=f"Predict the value of {target_colname}",
        suffix=f"What is the value of {target_colname}?",
        task_context=None,
        labels_mapping=None,
        label_values=df[target_colname].unique().tolist(),
    )

    task_config_yaml_path = os.path.join(task_output_dir, f"{task}.yaml")
    task_config.to_yaml(task_config_yaml_path)
    print(f"[INFO] TaskConfig written to {task_config_yaml_path}")

    csv_dest = os.path.join(task_output_dir, f"{task}.csv")

    df.to_csv(csv_dest, index=False)
