Skip to content
Snippets Groups Projects
Commit 4c9df5fd authored by Beibei Wang's avatar Beibei Wang
Browse files

Upload New File

parent 8b5a61c8
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from nltk import word_tokenize
# In[2]:
# Load the TripAdvisor hotel-review dataset.
Tripadvisor_reviews = pd.read_csv('Data/tripadvisor_hotel_reviews.csv')
Tripadvisor_reviews.head(5)
# In[3]:
# Keep only reviews whose text is pure ASCII (drops non-English rows),
# then renumber the index so positional lookups stay contiguous.
ascii_only = Tripadvisor_reviews.Review.map(lambda text: text.isascii())
Tripadvisor_reviews = Tripadvisor_reviews[ascii_only].reset_index(drop=True)
# In[4]:
# Load the AFINN sentiment lexicon and build a word -> valence lookup.
Afinn = pd.read_csv('Data/AFINN-en.csv')
Afinn_dict = dict(zip(Afinn.Words, Afinn.Valence))
# sentiment_analysis
# input: a review string
# output: a DataFrame with one row per matched lexicon word
def sentiment_analysis(review: str):
    """Score a single review against the AFINN lexicon.

    Tokenizes `review`, keeps the tokens whose lowercase form appears in
    the AFINN lexicon, and returns a DataFrame with columns:
    Word (token as it appeared), Frequency (count within this review)
    and Valence (the word's AFINN score).
    """
    # Tokenize the review into words.
    words = word_tokenize(review)
    # Build the lexicon membership set ONCE per call.  The original
    # evaluated set(afinn_words) inside the comprehension condition, so
    # the whole set was rebuilt for every single token.  Afinn_dict's
    # keys are exactly the AFINN words, so no second pass over Afinn.
    lexicon = set(Afinn_dict)
    matched = [token for token in words if token.lower() in lexicon]
    # Count occurrences in one pass (the original called
    # clean_text.count(x) per unique word: O(n^2)).
    frequency = Counter(matched)
    # Look up each matched word's valence (keys are lowercase).
    valence = [Afinn_dict.get(word.lower()) for word in frequency]
    return pd.DataFrame({'Word': list(frequency.keys()),
                         'Frequency': list(frequency.values()),
                         'Valence': valence})
# In[5]:
# Output of sentiment analysis for the first review
# (notebook display cell: the returned DataFrame is shown, not stored).
sentiment_analysis(Tripadvisor_reviews.Review[0])
# In[6]:
#define a function sentiment_analysis
#input a review
#output two values: positive and negative sentiment contributions
def review_S (review : str):
df_SA=sentiment_analysis(review)
FV=[a*b for a,b in zip (list(df_SA.Frequency),list(df_SA.Valence))]
ab_sum=sum([abs(ele) for ele in FV])
if ab_sum==0:
p_S=n_S=0
else:
p_S=sum([x for x in FV if x >= 0])/ab_sum
n_S=sum([x for x in FV if x < 0])/ab_sum
return round(p_S,4),round(n_S,4)
# In[7]:
# Positive and negative sentiment contributions for the first review.
review_S(Tripadvisor_reviews.Review[0])
# In[8]:
# Score every review and attach the shares as new columns.
pos_shares = []
neg_shares = []
for review_text in Tripadvisor_reviews.Review:
    p, n = review_S(review_text)
    pos_shares.append(p)
    neg_shares.append(n)
Tripadvisor_reviews['S+'] = pos_shares
Tripadvisor_reviews['S-'] = neg_shares
Tripadvisor_reviews.head(10)
# In[9]:
X = Tripadvisor_reviews[['S+','S-']]
y = Tripadvisor_reviews['Rating']
# add a constant term to estimate the intercept
X = sm.add_constant(X)
model = sm.OLS(y,X)
results = model.fit()
results.summary()
# In[10]:
# Accuracy of the fitted model: rating = 3.3333 + 1.4419*(S+) + 2.9685*(S-).
# A prediction counts as correct when it lies within 1 star of the true
# rating.  Vectorized (the original counted with a range(len(y)) loop);
# predicts and y share the same index, so the subtraction aligns rows.
predicts = results.predict(X)
accuracy = float(np.mean(np.abs(predicts - y) < 1))
accuracy
# In[11]:
Reviews=pd.read_csv('Data/reviews_San Francisco_02112021.csv')
Reviews.head(5)
# In[12]:
len(Reviews)
# In[13]:
#Remove rows with non english charaters
Reviews=Reviews.dropna()
Reviews['comments']=Reviews['comments'].astype(str)
Reviews=Reviews[Reviews.comments.map(lambda x: x.isascii())]
Reviews=Reviews.reset_index(drop=True)
Reviews.head(5)
# In[14]:
# Spot-check: sentiment table for one Airbnb comment.
sentiment_analysis(Reviews.comments[2019])
# In[15]:
# Spot-check: positive/negative shares for the same comment.
s1,s2=review_S(Reviews.comments[2019])
s1,s2
# In[16]:
# Show the raw comment text for comparison with the scores above.
Reviews.comments[2019]
# In[17]:
# Sentiment shares for every Airbnb comment.
# (The original had a bare `str(r)` whose result was discarded — a no-op;
# comments were already cast via astype(str) above, so it is removed.)
pS = []
nS = []
for comment in Reviews.comments:
    ps, ns = review_S(comment)
    pS.append(ps)
    nS.append(ns)
# In[18]:
# Predicted rating per comment using the model fitted above:
# rating = 3.3333 + 1.4419*(S+) + 2.9685*(S-); output range (0.3648, 4.7752).
# The original appended a spurious [3.3333]*len(...) tail onto s1/s2;
# zip() truncation silently discarded it, so dropping it leaves results
# identical while removing the misleading dead data.  The addition order
# (const + s1 term + s2 term) is preserved bit-for-bit.
rating = [3.3333 + p * 1.4419 + nv * 2.9685 for p, nv in zip(pS, nS)]
# In[19]:
# Attach per-comment sentiment shares and the predicted rating as columns.
Reviews['S+'] = pS
Reviews['S-'] = nS
Reviews['rating'] = rating
Reviews.head(10)
# In[20]:
# Consensus rating per Airbnb listing: the median of its comment-level
# ratings, rescaled toward [0, 1] using the model's output range.
# NOTE(review): the rescale divides by 4.7752 (the max) rather than by the
# span 4.7752 - 0.3648; confirm this is the intended normalization.
l_id = list(set(Reviews.listing_id))
con_rating = {}
for listing in l_id:
    listing_ratings = Reviews.loc[Reviews['listing_id'] == listing]['rating']
    con_rating[listing] = (np.median(list(listing_ratings)) - 0.3648) / 4.7752
df_con = pd.DataFrame({'id': list(con_rating.keys()),
                       'consensus_rating': list(con_rating.values())})
df_con.head(5)
# In[21]:
# Load the listings data; keep each listing's official review score.
List = pd.read_csv('Data/listings_San Francisco_02112021.csv')
List_rating = List[['id', 'review_scores_rating']].dropna()
List_rating.head(10)
# In[22]:
# Restrict to listings that actually appear in the reviews data.
List_rating = List_rating.loc[List_rating['id'].isin(l_id)]
List_rating.head(5)
# In[23]:
# Left-join the official scores with the consensus ratings on listing id.
df_rating=pd.merge(List_rating, df_con, on='id', how='left')
df_rating.head(5)
# In[24]:
import main
cotrip=[]
for i in range (0,len(list(df_rating.review_scores_rating))):
cot=main.compute_contrip(df_rating.review_scores_rating[i], df_rating.consensus_rating[i], 0.5, 10)
cotrip.append(cot)
df_rating['ConTrip_Score']=cotrip
# In[25]:
# Rescale every ConTrip score via the project-local main.scaling helper.
scale = [main.scaling(sc) for sc in cotrip]
df_rating['Scale_Score'] = scale
# In[26]:
# Persist the first five rows as table5_contrip.csv.
df_rating.head(5).to_csv('table5_contrip.csv')
# In[27]:
cons=round (df_rating.consensus_rating,1)
rating=round (df_rating.review_scores_rating)
df_f=df_rating
df_f['consensus']=cons
df_f['rating']=rating
#df_f=df_f.loc[df_f['consensus'].isin([0.0,0.2,0.4,0.6,0.8,1.0])]
df_f1=df_f.loc[df_f['rating'].isin([1.0,2.0,3.0,4.0,5.0])]
df_f2=df_f.loc[df_f['consensus'].isin([0.0,0.2,0.4,0.6,0.8,1.0])]
# In[28]:
import seaborn as sns
# Figure 5a: ConTrip score vs. bucketed consensus, colored by rating.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
data=df_f1,
x="consensus",
y="ConTrip_Score",
hue="rating",
palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
)
plt.savefig('figure5_a.png')
# In[29]:
# Figure 5b: ConTrip score vs. rounded rating, colored by consensus.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
data=df_f2,
x="rating",
y="ConTrip_Score",
hue="consensus",
palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
)
plt.savefig('figure5_b.png')
# In[30]:
# Figure 5c: scaled score vs. bucketed consensus, colored by rating.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
data=df_f1,
x="consensus",
y="Scale_Score",
hue="rating",
palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
)
plt.savefig('figure5_c.png')
# In[31]:
# Figure 5d: scaled score vs. rounded rating, colored by consensus.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
data=df_f2,
x="rating",
y="Scale_Score",
hue="consensus",
palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"],
)
plt.savefig('figure5_d.png')
# In[ ]:
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment