import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

from dataset import save_dataset

import os

# files = ['column', 'digits4', 'iris', 'wine', 'banknote', 'bcc', 'penguin', 'seeds']
# Base names (no extension) of the CSV files under exponential_loss_csvs/csv/
# that this script converts.
files = ['banknote', 'bcc', 'penguin', 'iran_customer_churn', 'column', 'digits4', 'seeds', 'occupancy_detection', 'fico', 'australian_credit', 'give_credit']

# [banknote, bcc, penguin, iranian customer churn, occupancy detection, telco, fico]

# Loop-invariant input/output locations.
CSV_DIR = 'exponential_loss_csvs/csv'
DATASETS_DIR = 'exponential_loss_csvs/datasets'

# For every listed dataset: load the CSV, split off the label column, min-max
# scale the features, and write the results under DATASETS_DIR/<name>.
for file in files:
    csv_path = f'{CSV_DIR}/{file}.csv'

    # Most files are read with every column forced to float; the exceptions
    # are read with pandas' inferred dtypes ('occupancy_detection' has a
    # 'date' column that is dropped below, 'fico' has a label column that is
    # converted to a categorical below).
    if file == 'somerville':
        df = pd.read_csv(csv_path, dtype=float, encoding="utf-16")
    elif file in ['occupancy_detection', 'fico']:
        df = pd.read_csv(csv_path)
    else:
        df = pd.read_csv(csv_path, dtype=float)

    # Pick the label column; by default it is the last column of the file.
    if file == 'somerville':
        y_c = 'D'
    elif file == 'fico':
        y_c = 'RiskPerformance'
        df[y_c] = df[y_c].astype('category')
    else:
        y_c = df.columns[-1]

    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if df.isnull().values.any():
        raise ValueError(f'{csv_path} contains missing values')

    y = df[y_c]
    X = df.drop(columns=[y_c])

    if file == 'wine':
        # Shift the labels down by one.
        y = y - 1
    elif file == 'occupancy_detection':
        X = X.drop(columns=['date'])
    elif file == 'fico':
        # Replace the categorical labels by their integer category codes.
        y = y.cat.codes

    # Per-feature flag: 1 when the column has more than 5 distinct values,
    # 0 otherwise. Saved below under the name 'filter'.
    continuous = (X.nunique() > 5).astype('int').to_numpy()

    X = X.to_numpy()
    y = y.to_numpy()

    # Rescale every feature with MinMaxScaler's default settings.
    min_max_scaler = MinMaxScaler()
    X = min_max_scaler.fit_transform(X)

    # if file == 'wine_pca4' or file == 'wine':
    #     y = 2-y

    # Create DATASETS_DIR and the per-dataset directory in one call; with
    # exist_ok=True this is a no-op when they already exist.
    out_dir = f'{DATASETS_DIR}/{file}'
    os.makedirs(out_dir, exist_ok=True)

    # y0 keeps the labels as loaded; y1 is a copy in which every 0 label is
    # replaced by -1 (presumably the -1/+1 encoding used with the exponential
    # loss -- confirm against the consumers of save_dataset's output).
    y0 = y
    y1 = y.copy()
    y1[y1 == 0] = -1

    save_dataset(out_dir, X, y0, y1)
    np.save(f'{out_dir}/filter', continuous)