diff --git a/Contrip.py b/Contrip.py
new file mode 100644
index 0000000000000000000000000000000000000000..13c9662e9150c9d2c891afd92b31359a9005286f
--- /dev/null
+++ b/Contrip.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python
+# coding: utf-8

+# In[1]:
+
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from nltk import word_tokenize
+import statsmodels.api as sm
+
+
+# In[2]:
+
+
+# Import hotel review data from Tripadvisor
+Tripadvisor_reviews = pd.read_csv('Data/tripadvisor_hotel_reviews.csv')
+Tripadvisor_reviews.head(5)
+
+
+# In[3]:
+
+
+# Remove rows containing non-ASCII (non-English) characters
+Tripadvisor_reviews = Tripadvisor_reviews[Tripadvisor_reviews.Review.map(lambda x: x.isascii())]
+Tripadvisor_reviews = Tripadvisor_reviews.reset_index(drop=True)
+
+
+# In[4]:
+
+
+# Import the AFINN lexicon and build a word -> valence lookup
+Afinn = pd.read_csv('Data/AFINN-en.csv')
+Afinn_dict = dict(zip(Afinn.Words, Afinn.Valence))
+afinn_words = set(Afinn.Words)
+
+# sentiment_analysis:
+#   input:  a review (str)
+#   output: a DataFrame of matched words with their frequency and valence
+def sentiment_analysis(review: str):
+    # Tokenize the review into words
+    words = word_tokenize(review)
+    # Keep only the tokens that appear in the AFINN lexicon
+    clean_text = [token for token in words if token.lower() in afinn_words]
+    # Frequency of each matched word
+    frequency = {x: clean_text.count(x) for x in set(clean_text)}
+    # Valence of each matched word
+    valence = [Afinn_dict.get(key.lower()) for key in frequency]
+    return pd.DataFrame({'Word': list(frequency.keys()),
+                         'Frequency': list(frequency.values()),
+                         'Valence': valence})
+
+
+# In[5]:
+
+
+# Output of sentiment analysis for the first review
+sentiment_analysis(Tripadvisor_reviews.Review[0])
+
+
+# In[6]:
+
+
+# review_S:
+#   input:  a review (str)
+#   output: (S+, S-), the positive and negative sentiment contributions,
+#           each normalised by the sum of absolute frequency-weighted valences
+def review_S(review: str):
+    df_SA = sentiment_analysis(review)
+    # Frequency-weighted valences
+    FV = [a * b for a, b in zip(df_SA.Frequency, df_SA.Valence)]
+    ab_sum = sum(abs(ele) for ele in FV)
+    if ab_sum == 0:
+        p_S = n_S = 0
+    else:
+        p_S = sum(x for x in FV if x >= 0) / ab_sum
+        n_S = sum(x for x in FV if x < 0) / ab_sum
+    return round(p_S, 4), round(n_S, 4)
+
+
+# In[7]:
+
+
+# Positive and negative sentiment contributions for the first review
+review_S(Tripadvisor_reviews.Review[0])
+
+
+# In[8]:
+
+
+p_S = []
+n_S = []
+for rev in list(Tripadvisor_reviews.Review):
+    p, n = review_S(rev)
+    p_S.append(p)
+    n_S.append(n)
+Tripadvisor_reviews['S+'] = p_S
+Tripadvisor_reviews['S-'] = n_S
+Tripadvisor_reviews.head(10)
+
+
+# In[9]:
+
+
+X = Tripadvisor_reviews[['S+', 'S-']]
+y = Tripadvisor_reviews['Rating']
+# Add a constant term to estimate the intercept
+X = sm.add_constant(X)
+model = sm.OLS(y, X)
+results = model.fit()
+results.summary()
+
+
+# In[10]:
+
+
+# Accuracy of the predictions rating = 3.3333 + 1.4419*(S+) + 2.9685*(S-):
+# a prediction counts as correct when it is within one star of the true rating
+predicts = results.predict(X)
+count = 0
+for i in range(len(y)):
+    if abs(predicts[i] - y[i]) < 1:
+        count = count + 1
+accuracy = count / len(y)
+accuracy
+
+
+# In[11]:
+
+
+# Import Airbnb review data for San Francisco
+Reviews = pd.read_csv('Data/reviews_San Francisco_02112021.csv')
+Reviews.head(5)
+
+
+# In[12]:
+
+
+len(Reviews)
+
+
+# In[13]:
+
+
+# Drop missing comments, then remove rows containing non-ASCII characters
+Reviews = Reviews.dropna()
+Reviews['comments'] = Reviews['comments'].astype(str)
+Reviews = Reviews[Reviews.comments.map(lambda x: x.isascii())]
+Reviews = Reviews.reset_index(drop=True)
+Reviews.head(5)
+
+
+# In[14]:
+
+
+sentiment_analysis(Reviews.comments[2019])
+
+
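+# In[ ]:
+
+
+# Illustrative sanity check (not part of the original analysis): for any review
+# with at least one lexicon match, S+ - S- == 1 up to rounding, because both
+# shares are normalised by the same sum of absolute frequency-weighted valences.
+# Assumes the words 'good' and 'bad' are present in the loaded AFINN-en.csv.
+p_demo, n_demo = review_S("good good location bad noise")
+assert abs((p_demo - n_demo) - 1.0) < 1e-3
+p_demo, n_demo
+
+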
+# In[15]:
+
+
+s1, s2 = review_S(Reviews.comments[2019])
+s1, s2
+
+
+# In[16]:
+
+
+Reviews.comments[2019]
+
+
+# In[17]:
+
+
+pS = []
+nS = []
+for r in list(Reviews.comments):
+    ps, ns = review_S(r)
+    pS.append(ps)
+    nS.append(ns)
+
+
+# In[18]:
+
+
+# Predict Airbnb ratings with the fitted coefficients:
+# rating = 3.3333 + 1.4419*(S+) + 2.9685*(S-), which ranges over (0.3648, 4.7752)
+const = [3.3333] * len(pS)
+s1 = [e * 1.4419 for e in pS]
+s2 = [e * 2.9685 for e in nS]
+rating = [i + j + k for i, j, k in zip(const, s1, s2)]
+
+
+# In[19]:
+
+
+# Append pS, nS and rating to Reviews
+Reviews['S+'] = pS
+Reviews['S-'] = nS
+Reviews['rating'] = rating
+Reviews.head(10)
+
+
+# In[20]:
+
+
+# Consensus rating per Airbnb listing: the median of its predicted ratings,
+# shifted by the minimum (0.3648) and divided by the maximum (4.7752) of the
+# prediction range
+l_id = list(set(Reviews.listing_id))
+con_rating = {}
+for i in l_id:
+    med = np.median(Reviews.loc[Reviews['listing_id'] == i, 'rating'])
+    con_rating[i] = (med - 0.3648) / 4.7752
+df_con = pd.DataFrame({'id': list(con_rating.keys()),
+                       'consensus_rating': list(con_rating.values())})
+df_con.head(5)
+
+
+# In[21]:
+
+
+# Import listing data and keep the id and the official review score
+List = pd.read_csv('Data/listings_San Francisco_02112021.csv')
+List_rating = List[['id', 'review_scores_rating']]
+List_rating = List_rating.dropna()
+List_rating.head(10)
+
+
+# In[22]:
+
+
+# Keep only listings that have reviews
+List_rating = List_rating.loc[List_rating['id'].isin(l_id)]
+List_rating.head(5)
+
+
+# In[23]:
+
+
+df_rating = pd.merge(List_rating, df_con, on='id', how='left')
+df_rating.head(5)
+
+
+# In[24]:
+
+
+import main
+cotrip = []
+for i in range(len(df_rating)):
+    cot = main.compute_contrip(df_rating.review_scores_rating[i],
+                               df_rating.consensus_rating[i], 0.5, 10)
+    cotrip.append(cot)
+df_rating['ConTrip_Score'] = cotrip
+
+
+# In[25]:
+
+
+scale = []
+for sc in cotrip:
+    scale.append(main.scaling(sc))
+df_rating['Scale_Score'] = scale
+
+
+# In[26]:
+
+
+df_rating.head(5).to_csv('table5_contrip.csv')
+
+
+# In[27]:
+
+
+# Bin consensus to one decimal place and rating to the nearest star for plotting
+df_f = df_rating
+df_f['consensus'] = round(df_rating.consensus_rating, 1)
+df_f['rating'] = round(df_rating.review_scores_rating)
+df_f1 = df_f.loc[df_f['rating'].isin([1.0, 2.0, 3.0, 4.0, 5.0])]
+df_f2 = df_f.loc[df_f['consensus'].isin([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])]
+
+
+# In[28]:
+
+
+import seaborn as sns
+# ConTrip score vs consensus, coloured by star rating
+# (a list palette must supply one colour per distinct hue level)
+fig, ax = plt.subplots(figsize=(5, 5))
+sns.scatterplot(
+    data=df_f1,
+    x="consensus",
+    y="ConTrip_Score",
+    hue="rating",
+    palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
+    )
+plt.savefig('figure5_a.png')
+
+
+# In[29]:
+
+
+# ConTrip score vs star rating, coloured by binned consensus
+# (note: consensus can take six binned values, so a sixth colour may be needed)
+fig, ax = plt.subplots(figsize=(5, 5))
+sns.scatterplot(
+    data=df_f2,
+    x="rating",
+    y="ConTrip_Score",
+    hue="consensus",
+    palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
+    )
+plt.savefig('figure5_b.png')
+
+
+# In[30]:
+
+
+# Scaled score vs consensus, coloured by star rating
+fig, ax = plt.subplots(figsize=(5, 5))
+sns.scatterplot(
+    data=df_f1,
+    x="consensus",
+    y="Scale_Score",
+    hue="rating",
+    palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
+    )
+plt.savefig('figure5_c.png')
+
+
+# In[31]:
+
+
+# Scaled score vs star rating, coloured by binned consensus
+fig, ax = plt.subplots(figsize=(5, 5))
+sns.scatterplot(
+    data=df_f2,
+    x="rating",
+    y="Scale_Score",
+    hue="consensus",
+    palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
+    )
+plt.savefig('figure5_d.png')
+
+
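+# In[ ]:
+
+
+# Optional cross-check (illustrative, not part of the original pipeline):
+# predict the Airbnb ratings directly from the fitted OLS results instead of
+# the hard-coded coefficients, so the numbers stay in sync if the model is
+# refit; small differences reflect the 4-decimal rounding of the coefficients.
+# Assumes the design-matrix columns ('const', 'S+', 'S-') built in In[9].
+X_airbnb = sm.add_constant(pd.DataFrame({'S+': pS, 'S-': nS}))
+rating_check = results.predict(X_airbnb)
+max(abs(rating_check[i] - Reviews['rating'][i]) for i in range(len(rating_check)))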