# -*- coding: utf-8 -*-
"""
Regression clustering of operating points.

Data points (input, output) are partitioned into a fixed number of clusters,
each described by its own linear regression line.

Created on Wed Jan 27 08:53:02 2021
Created on Fri Dec 11 10:58:32 2020

@author: niels
"""
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd

# NOTE(review): `pandapoints` is loaded but not used anywhere in this file.
pandapoints = pd.read_excel(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuDonetwo.xlsx")

i = 3                   # number of iterations for Multiple Regression Clustering
number_of_clusters = 2  # number of clusters
patience = 10           # algorithm finishes after not getting better `patience` times

operating_points_0 = pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuDonetwo.CSV", delimiter=";", decimal=",")
operating_points_3 = pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuD_3.CSV", delimiter=";", decimal=",")
operating_points_1 = pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\GuDkomplett.CSV", delimiter=";", decimal=",")
operating_points_2 = pd.read_csv(r"C:\Users\niels\Documents\LTT\RegressionClustering\operating_points_testLarge.CSV", delimiter=";", decimal=",")

all_components = {
    "component_0": {"state_0": operating_points_1},
    "component_1": {"state_0": operating_points_2},
}
one_state = {"state_0": operating_points_1}
# The second key used to be "state_1:" (stray colon), which made it
# unreachable under the naming scheme used everywhere else.
two_states = {"state_0": operating_points_0, "state_1": operating_points_3}
colors = ['red', 'blue', 'green', 'black', 'orange', 'cyan']


def MultiRC(component, state, number_of_clusters, i=50, patience=10,
            maxIterations=5000, preformatted=False):
    """Run `RC` several times and keep the clustering with the highest score.

    `RC` starts from a random assignment, so repeated runs can end in
    different clusterings; this function keeps the best one.

    Parameters
    ----------
    component : mapping of state name -> pandas.DataFrame
    state : key selecting the DataFrame inside `component`
    number_of_clusters : int
    i : int
        Number of independent `RC` runs.
    patience, maxIterations, preformatted :
        Passed through to `RC` unchanged.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per cluster (columns 0 = input, 1 = output) from the
        best run; an empty list if `i` is 0.
    """
    best_score = -np.inf   # R^2 sums may be <= 0, so 0 is not a safe floor
    best_clusters = []
    for run in range(i):
        run_clusters, run_score = RC(component, state, number_of_clusters,
                                     patience, maxIterations, preformatted)
        # Always accept the first result so a usable clustering is returned
        # even if every score is non-positive.
        if not best_clusters or run_score > best_score:
            best_score = run_score
            best_clusters = run_clusters
        print("Iteration", run, " completed")
    print("Clustering completed")
    return best_clusters


def _extract_xy(frame, preformatted):
    """Return the input and output values of `frame` as two float arrays."""
    x = []
    y = []
    if not preformatted:
        # Collect input/output values by header marker.
        for header in frame:
            label = str(header)
            if '(in)' in label:
                x.extend(frame[header])
            if '(out)' in label:
                y.extend(frame[header])
    # If the data is preformatted, or a header is misspelled or missing,
    # use the first column as input and the second column as output.
    if not x:
        x = list(frame.iloc[:, 0])
    if not y:
        y = list(frame.iloc[:, 1])
    if len(y) < len(x):
        raise ValueError("fewer output values than input values")
    # Points are paired by position; surplus output values are ignored.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y[:len(x)], dtype=float)
    return x, y


def _fit_models(X, y, labels, number_of_clusters, previous=None):
    """Fit one regression line per cluster; return (models, summed R^2)."""
    models = []
    total = 0.0
    for idx in range(number_of_clusters):
        members = labels == idx
        if members.sum() < 2:
            # A line cannot be fitted/scored on fewer than two points.
            # Keep the previous line (if any) so the cluster can win points
            # back; it contributes nothing to the score.
            models.append(previous[idx] if previous is not None else None)
            continue
        model = LinearRegression().fit(X[members], y[members])
        models.append(model)
        total += model.score(X[members], y[members])
    return models, total


def _assign_to_nearest(models, X, y):
    """Return, for every point, the index of the model with the smallest squared error."""
    squared_error = np.full((len(y), len(models)), np.inf)
    for idx, model in enumerate(models):
        if model is not None:
            squared_error[:, idx] = (model.predict(X) - y) ** 2
    # argmin returns the first minimum, i.e. ties go to the lowest index.
    return squared_error.argmin(axis=1)


def _clusters_to_frames(x, y, labels, number_of_clusters):
    """Build one DataFrame (columns 0 and 1) per cluster from the labels."""
    return [pd.DataFrame({0: x[labels == idx], 1: y[labels == idx]})
            for idx in range(number_of_clusters)]


def RC(component, state, number_of_clusters, patience=10, maxIterations=5000,
       preformatted=False):
    """Cluster the points of ``component[state]`` around regression lines.

    Starting from a random assignment, the function alternates between
    fitting one linear regression per cluster and moving every point to the
    cluster whose line predicts it best. The quality measure (TRS) is the sum
    of the per-cluster `LinearRegression.score` values.

    Parameters
    ----------
    component : mapping of state name -> pandas.DataFrame
    state : key selecting the DataFrame inside `component`
    number_of_clusters : int
    patience : int
        The run stops once the score has failed to improve this many times in
        total (the counter is not reset after an improvement).
    maxIterations : int
        Upper bound on reassignment rounds, to prevent running endlessly.
    preformatted : bool
        If true, column 0 is the input and column 1 the output. Otherwise
        columns whose header contains '(in)' / '(out)' are used, falling back
        to columns 0 / 1 when no such header exists.

    Returns
    -------
    (list of pandas.DataFrame, float)
        One DataFrame per cluster (columns 0 = input, 1 = output) for the
        best-scoring assignment seen, and that assignment's score. The same
        pair is returned when `maxIterations` is exhausted.

    Raises
    ------
    ValueError
        If there are no data points, fewer outputs than inputs, or
        `number_of_clusters` is smaller than 1.
    """
    if number_of_clusters < 1:
        raise ValueError("number_of_clusters must be at least 1")
    x, y = _extract_xy(component[state], preformatted)
    if len(x) == 0:
        raise ValueError("no data points to cluster")
    X = x.reshape(-1, 1)   # reshaping for regression function

    # Initialize clusters with a random assignment of data points.
    labels = np.random.randint(0, number_of_clusters, size=len(x))
    models, score = _fit_models(X, y, labels, number_of_clusters)

    # Snapshot of the best assignment; a copy, so later reassignments
    # cannot alter it.
    best_score = score
    best_labels = labels.copy()
    stalls = 0

    for _ in range(maxIterations):
        labels = _assign_to_nearest(models, X, y)
        models, new_score = _fit_models(X, y, labels, number_of_clusters,
                                        previous=models)
        if new_score > best_score:
            best_score = new_score
            best_labels = labels.copy()
        if new_score <= score:
            stalls += 1
            if stalls >= patience:
                # Only finishes if the score does not improve `patience` times.
                print("TRS:", best_score)
                return (_clusters_to_frames(x, y, best_labels, number_of_clusters),
                        best_score)
        score = new_score

    print("No solution found in maximum number of Iterations.")
    return (_clusters_to_frames(x, y, best_labels, number_of_clusters),
            best_score)


# MultiRC returns only the list of cluster DataFrames.
df_list = MultiRC(two_states, 'state_0', number_of_clusters, i, patience)
for idx, df in enumerate(df_list):
    # Cycle through the palette so more than len(colors) clusters still plot.
    df.plot.scatter(0, 1, c=colors[idx % len(colors)])