# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 08:53:02 2021

@author: niels
"""

# -*- coding: utf-8 -*-
"""
Created on Fri Dec 11 10:58:32 2020

@author: niels
"""

import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd

# --- Run configuration and input data (evaluated at import time) ---
pandapoints=pd.read_excel(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuDonetwo.xlsx")
i=3 #number of independent RC runs performed by MultiRC
number_of_clusters=2 #number of clusters (regression lines) to fit
patience=10 #RC stops after this many iterations that did not improve the score
# The CSV files use ';' as column separator and ',' as decimal mark.
operating_points_0=pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuDonetwo.CSV", delimiter=";",decimal=",")
operating_points_3=pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuD_3.CSV", delimiter=";",decimal=",")
operating_points_1=pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuDkomplett.CSV", delimiter=";",decimal=",")
operating_points_2=pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\operating_points_testLarge.CSV",delimiter=";",decimal=",")
# A "component" maps state names to the DataFrame holding that state's operating points.
all_components={"component_0": {"state_0": operating_points_1}, "component_1": {"state_0": operating_points_2}}
one_state={"state_0": operating_points_1}
# Key is "state_1" (no trailing colon) so it follows the same naming as every other state key.
two_states={"state_0": operating_points_0, "state_1":operating_points_3}
colors=['red', 'blue','green','black','orange','cyan'] #one plot color per cluster
def MultiRC(component,state,number_of_clusters,i=50,patience=10,maxIterations=5000, preformatted=False): 
    """Run RC several times and keep the clustering with the highest score.

    Each RC run starts from its own random assignment, so repeating it and
    keeping the best result reduces the dependence on the starting point.

    Parameters
    ----------
    component : mapping of state name -> DataFrame, forwarded to RC.
    state : key into ``component`` selecting the data to cluster.
    number_of_clusters : number of clusters, forwarded to RC.
    i : number of RC runs to perform.
    patience, maxIterations, preformatted : forwarded unchanged to RC.

    Returns
    -------
    list
        The clusters of the best-scoring run (only the clusters, not the
        score). Empty if ``i`` is 0 or no run produced a result.
    """
    best_TRS = None  # no score seen yet; None (not 0) so non-positive scores can still win
    clusters = []
    for j in range(i):
        result = RC(component, state, number_of_clusters, patience, maxIterations, preformatted)
        # A run that yields nothing (None) is skipped instead of crashing on unpacking.
        if result is not None:
            temp_clusters, temp_TRS = result
            if best_TRS is None or temp_TRS > best_TRS:
                best_TRS = temp_TRS
                clusters = temp_clusters
        print("Iteration",j," completed")
    print("Clustering completed")
    return clusters

def RC (component, state, number_of_clusters,patience=10,maxIterations=5000, preformatted=False):
    """Cluster the points of one state around several regression lines.

    Points are first assigned to clusters at random (``np.random``). Then, in
    every iteration, one ``LinearRegression`` is fitted per cluster and each
    point is moved to the cluster whose line has the smallest squared error
    for it. The score of an assignment ("TRS") is the sum of the per-cluster
    ``LinearRegression.score`` values.

    Parameters
    ----------
    component : mapping of state name -> DataFrame.
    state : key into ``component`` selecting the DataFrame to cluster.
    number_of_clusters : number of clusters / regression lines.
    patience : the search stops once this many iterations scored no better
        than the iteration before them.
    maxIterations : upper bound on the number of reassignment iterations.
    preformatted : if true, column 0 is taken as input and column 1 as output
        without looking at the headers. Otherwise columns whose header
        contains '(in)' / '(out)' are used, falling back to column 0 / 1
        when no such header exists.

    Returns
    -------
    (list of DataFrame, score)
        One DataFrame per cluster (column 0 = input, column 1 = output) for
        the best-scoring assignment seen, and that score. If ``maxIterations``
        is exhausted first, a message is printed and the best assignment
        found so far is returned.

    Raises
    ------
    IndexError
        If fewer output values than input values were collected.

    NOTE(review): a cluster that ends up without points is not handled here;
    the regression fit is expected to fail in that case - confirm whether a
    fallback is wanted.
    """
    data = component[state]
    x = []
    y = []
    if not preformatted:
        # Collect input/output values from the columns tagged in their header.
        for header in data:
            if '(in)' in str(header):
                x.extend(data[header])
            if '(out)' in str(header):
                y.extend(data[header])
    # Preformatted data, or headers without the tags: first column is the
    # input, second column the output.
    if not x:
        x = list(data.iloc[:, 0])
    if not y:
        y = list(data.iloc[:, 1])

    X = np.reshape(x, (-1, 1))  # column vector, as required by the regression
    Y = np.asarray(y)[:len(X)]  # only the outputs paired with an input are used
    if len(Y) < len(X):
        # IndexError (rather than ValueError) so callers keep seeing the
        # exception type that pairing x[i] with a missing y[i] produces.
        raise IndexError("fewer output values than input values")

    # Random initial assignment: labels[p] is the cluster index of point p.
    labels = np.random.randint(0, number_of_clusters, size=len(X))
    models, score = _rc_fit_cluster_lines(X, Y, labels, number_of_clusters)
    best_score = score
    best_labels = labels

    stalled_iterations = 0
    finished = False
    for _ in range(maxIterations):  # bounded so the search cannot run endlessly
        # Squared error of every point against every cluster's line: shape
        # (points, clusters). argmin picks the closest line, first one on ties.
        squared_errors = np.column_stack(
            [(model.predict(X) - Y) ** 2 for model in models]
        )
        new_labels = squared_errors.argmin(axis=1)
        if np.array_equal(new_labels, labels):
            # Fixed point: refitting would reproduce the same lines and the
            # same assignment, so further iterations cannot change anything.
            finished = True
            break
        labels = new_labels
        models, new_score = _rc_fit_cluster_lines(X, Y, labels, number_of_clusters)
        if new_score > best_score:
            # Remember the assignment together with the score it produced.
            best_score = new_score
            best_labels = labels
        if new_score <= score:
            stalled_iterations += 1
            if stalled_iterations >= patience:
                finished = True
                break
        score = new_score

    if finished:
        print("TRS:", best_score)
    else:
        print("No solution found in maximum number of Iterations.")
    return _rc_clusters_to_frames(X, Y, best_labels, number_of_clusters), best_score


def _rc_fit_cluster_lines(X, Y, labels, number_of_clusters):
    """Fit one LinearRegression per cluster; return (models, summed score)."""
    models = []
    total_score = 0
    for k in range(number_of_clusters):
        members = labels == k
        model = LinearRegression().fit(X[members], Y[members])
        models.append(model)
        total_score += model.score(X[members], Y[members])
    return models, total_score


def _rc_clusters_to_frames(X, Y, labels, number_of_clusters):
    """Return one DataFrame per cluster with x in column 0 and y in column 1."""
    frames = []
    for k in range(number_of_clusters):
        members = labels == k
        frames.append(pd.DataFrame({0: X[members, 0], 1: Y[members]}))
    return frames
    
    
# MultiRC returns only the list of cluster DataFrames (no score), so the
# result is bound to a single name instead of being unpacked into two.
df_list=MultiRC(two_states,'state_0',number_of_clusters,i,patience)
for idx, df in enumerate(df_list):
    # Cycle through the palette so more clusters than colors can still be plotted.
    df.plot.scatter(0,1, c=colors[idx % len(colors)])