import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffSaver;
import weka.core.converters.CSVSaver;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.AddExpression;
import weka.filters.unsupervised.instance.RemoveDuplicates;
import weka.filters.unsupervised.instance.RemoveWithValues;
import java.io.File;
import java.util.Random;


public class data_preparation {
    public static void main(String args[]) throws Exception {


        //IMPORTING + COMBINING DATA

        //Import CSV data set (intial housing data set)
        DataSource src = new DataSource("./housing.csv");   //create DataSource pointing to data set
        Instances housing = src.getDataSet();   //load data set into instances
        //System.out.println(housing);   //show data set

        //Import ARFF data set (with new columns for housing)
        DataSource src2 = new DataSource("./housing_newColumns.arff");   //create DataSource pointing to data set
        Instances housing_newcol = src2.getDataSet();   //load data set into instances
        //System.out.println(housing_newcol);   //show data set

        //Merge 2 data sets (column-vise)
        Instances housing_new = Instances.mergeInstances(housing, housing_newcol);   //merging both data sets
        //System.out.println(housing_new);   //show new data set

        //Import CSV data set (with new rows for housing_new)
        DataSource src3 = new DataSource("./housing_newRows.csv");   //create DataSource pointing to data set
        Instances housing_newrow = src3.getDataSet();   //load data set into instances
        //System.out.println(housing_newrow);   //show data set

        //Append 2 data sets (rows-vise)
        for(int i = 0; i < housing_newrow.numInstances(); i++) {   //for each instance in housing_newrow
           housing_new.add(housing_newrow.instance(i));   //append the instance to housing_new
        }


        //UNDERSTANDING DATA

        //Print out first 5 Instances
        for (int i = 0; i < 5; i++) {   //for each integer between 0 and 4
            System.out.println(housing_new.get(i));   //print out the instance with the integers index
        }

        //Get Summary
        System.out.println(housing_new.toSummaryString());   //print out a string containing a summary of the data set

        //Get mean/mode from each variable
        for (int i = 0; i < housing_new.numAttributes(); i++) {   //for each variable
            if(housing_new.attribute(i).type() == 0) {   //if the variable is numeric
                System.out.println("The mean of the " + housing_new.attribute(i).name() + " variable is: " + housing_new.meanOrMode(i));   //print out the mean
            }
            else{   //id the variable is not numeric
                int index = (int) housing_new.meanOrMode(i);   //get the index of the mode value
                System.out.println("The mode of the " + housing_new.attribute(i).name() + " variable is: " + housing_new.attribute(i).value(index));   //print out the mode
            }
        }

        //Get min/max value from all numeric variables
        for (int i = 0; i < housing_new.numAttributes(); i++) {   //for each variable
            if(housing_new.attribute(i).type() == 0){   //if the variable is numeric
                System.out.println("The smallest Value of the " + housing_new.attribute(i).name() + "variable is: " + housing_new.kthSmallestValue(i, 1));   //print out the smallest value
                System.out.println("The biggest Value of the " + housing_new.attribute(i).name() + "variable is: " + housing_new.kthSmallestValue(i, housing_new.numInstances()));   //print out the biggest value
            }
        }


        //SELECTING DATA

        //Delete unnecessary Attribute (balcony_y/n)
        housing_new.deleteAttributeAt(6);  //delete attribute at position 6

        //Filter every value below 170000
        RemoveWithValues filter_min = new RemoveWithValues();   //create filter

        String[] options = new String[4];  //give the options for the filter
        options[0] = "-C";   //which column
        options[1] = "5";   //5th column
        options[2] = "-S";  //select instance with smaller values than S
        options[3] = "170000";   //S = 170,000
        filter_min.setOptions(options);   //set options

        filter_min.setInputFormat(housing_new);   //set input for Filter (data set)
        Instances housing_expensive = Filter.useFilter(housing_new, filter_min);   //use the filter
        //System.out.println(housing_expensive);   //print out new data set


        //CLEANING DATA

        //Duplicate Detection
        RemoveDuplicates filter = new RemoveDuplicates();   //create filter
        filter.setInputFormat(housing_expensive);   //set input for Filter (data set)
        Instances housing_nodup = Filter.useFilter(housing_expensive, filter);   //use the filter
        //System.out.println(filteredData);   //print out new data set

        //Rename Attribute
        housing_nodup.renameAttribute(5,"rooms");   //rename column 5 to rooms
        //System.out.println(housing_expensive);   //print out new data set

        //Remove missing values
        housing_nodup.removeIf(Instance::hasMissingValue);   //if row contains missing value remove the row


        //CREATE NEW DATA

        //Create new column
        AddExpression addExpressionFilter = new AddExpression();   //create filter

        addExpressionFilter.setExpression("a5 / a4");   //how should the new column look like (fifth column divided by forth column)
        addExpressionFilter.setName("price_per_sqrm");   //set the name of the new column
        addExpressionFilter.setInputFormat(housing_nodup);   //set input for Filter (data set)

        Instances housing_prepared = Filter.useFilter(housing_nodup, addExpressionFilter);   //use the filter
        System.out.println(housing_prepared);   //print out new data set


        //SPLITTING THE DATA

        //Split Data set into training and testing data set
        int seed = 42;   //set seed for randomization

        double trainPercentage = 80.0;   //set percentage size of training data

        housing.randomize(new Random(seed));   //randomize data with seed

        int trainSize = (int) Math.round(housing.numInstances() * trainPercentage / 100.0);   //calculate training data size
        int testSize = housing.numInstances() - trainSize;   //calculate testing data size

        Instances trainData = new Instances(housing, 0, trainSize);   //create training data
        Instances testData = new Instances(housing, trainSize, testSize);   //create testing data

        //System.out.println(trainData);   //print out training data
        //System.out.println(testData);   //print out testing data


        //SAVING DATA

        //ARFF
        ArffSaver sv_arff = new ArffSaver();   //create saving instance
        sv_arff.setInstances(housing_prepared);   //set instances which should be saved
        sv_arff.setFile(new File("./output.arff"));   //set file path and file name of the new data set
        sv_arff.writeBatch();   //write the data into the file

        //CSV
        /*
        CSVSaver sv_csv = new CSVSaver();   //create saving instance
        sv_csv.setInstances(housing_prepared);   //set instances which should be saved
        sv_csv.setFile(new File("./output.csv"));   //set file path and file name of the new data set
        sv_csv.writeBatch();   //write the data into the file
        */

    }
}