#!/usr/bin/env python
# coding: utf-8
# Under Airbnb's review policy, fake, biased, or irrelevant reviews are removed; in general, most removed reviews are abnormal.
# We can therefore identify abnormal rooms and reviewers from these removed reviews.
# Abnormal rooms/reviewers are ranked by the number of removed reviews and by the percentage of removed reviews among their total reviews.
# In[1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import words
import re
from nltk import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# In[2]:
# pip install wordcloud
# pip install vaderSentiment
# In[3]:
# Import the listings of Los Angeles, New York and San Francisco for 02.2021 and 11.2021,
# add 'city_id' (1: Los Angeles, 2: New York, 3: San Francisco) and combine the three cities.
# NOTE: the 02.2021 listing filenames below are assumed to use the February snapshot date
# ('04022021'), matching the naming of the February review files loaded further down.
List_LA_02=pd.read_csv('Data/listings_Los Angeles_04022021.csv')
List_LA_02['city_id']=1
List_NY_02=pd.read_csv('Data/listings_New York_04022021.csv')
List_NY_02['city_id']=2
List_SF_02=pd.read_csv('Data/listings_San Francisco_04022021.csv')
List_SF_02['city_id']=3
Listings_02=pd.concat([ List_LA_02,List_NY_02,List_SF_02], join="inner")
List_LA_11=pd.read_csv('Data/listings_Los Angeles_02112021.csv')
List_LA_11['city_id']=1
List_NY_11=pd.read_csv('Data/listings_New York_02112021.csv')
List_NY_11['city_id']=2
List_SF_11=pd.read_csv('Data/listings_San Francisco_02112021.csv')
List_SF_11['city_id']=3
Listings_11=pd.concat([ List_LA_11,List_NY_11,List_SF_11], join="inner")
# In[4]:
print ('The number of rooms in 02.2021:',len(Listings_02))
print ('The number of rooms in 11.2021:',len(Listings_11))
# In[5]:
# Import the reviews of Los Angeles, New York and San Francisco for 02.2021 and 11.2021, and combine the three cities' lists.
# Add 'city_id' (1: Los Angeles, 2: New York, 3: San Francisco).
Reviews_LA_02=pd.read_csv('Data/reviews_Los Angeles_04022021.csv')
Reviews_LA_02['city_id']=1
Reviews_NY_02=pd.read_csv('Data/reviews_New York_04022021.csv')
Reviews_NY_02['city_id']=2
Reviews_SF_02=pd.read_csv('Data/reviews_San Francisco_04022021.csv')
Reviews_SF_02['city_id']=3
Reviews_LA_11=pd.read_csv('Data/reviews_Los Angeles_02112021.csv')
Reviews_LA_11['city_id']=1
Reviews_NY_11=pd.read_csv('Data/reviews_New York_02112021.csv')
Reviews_NY_11['city_id']=2
Reviews_SF_11=pd.read_csv('Data/reviews_San Francisco_02112021.csv')
Reviews_SF_11['city_id']=3
Reviews_02=pd.concat([Reviews_LA_02,Reviews_NY_02,Reviews_SF_02], join="inner")
Reviews_11=pd.concat([Reviews_LA_11,Reviews_NY_11,Reviews_SF_11], join="inner")
# In[6]:
# Return the elements of l1 that are not in l2 (a merge-style walk over two sorted lists).
# Note: both input lists are sorted in place.
def list_difference(l1: list, l2: list):
    if len(l1) > len(l2):
        return "wrong length"
    l_d = []
    l1.sort()
    l2.sort()
    j = 0
    for i in range(0, len(l1)):
        # advance j until l2[j] >= l1[i] (or l2 is exhausted)
        while j < len(l2) and l2[j] < l1[i]:
            j = j + 1
        if j == len(l2) or l2[j] > l1[i]:
            # l1[i] does not occur in l2
            l_d.append(l1[i])
        else:
            # match found, move past it
            j = j + 1
    return l_d
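# A quick sanity check on toy lists (an added illustration, not part of the original
# pipeline): for unique elements, the result should match a plain set difference.
_l1, _l2 = [1, 3, 5], [2, 3, 4, 6]
assert sorted(list_difference(_l1, _l2)) == sorted(set(_l1) - set(_l2))  # both give [1, 5]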
# In[7]:
# Reviewer ids that appear in the 02.2021 reviews but not in the 11.2021 reviews.
l02=list(set(list(Reviews_02.reviewer_id)))
l11=list(set(list(Reviews_11.reviewer_id)))
l_dere=list_difference(l02,l11)
# In[8]:
len(l_dere)
# In[9]:
print ('The number of reviews in 02.2021:',len(Reviews_02))
print ('The number of reviews in 11.2021:',len(Reviews_11))
# In[10]:
# Function to find removed rows: rows whose 'id' is in the 02.2021 dataset (df1) but not in the 11.2021 dataset (df2).
def search_removeddata(df1: pd.DataFrame, df2: pd.DataFrame):
    l1=list(df1['id'])
    l2=list(df2['id'])
    l_d=list_difference(l1,l2)
    df_removed_reviews=df1[df1['id'].isin(l_d)]
    return df_removed_reviews
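# As a cross-check (a sketch added here, not the notebook's original method), the same
# removed rows can be selected with a boolean mask; this should agree with
# search_removeddata whenever the 'id' column holds unique values,
# e.g. search_removeddata_isin(Reviews_02, Reviews_11).
def search_removeddata_isin(df1: pd.DataFrame, df2: pd.DataFrame):
    # Keep the rows of df1 whose id never appears in df2.
    return df1[~df1['id'].isin(df2['id'])]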
# In[11]:
# Test and verify the algorithm on the Salem dataset, where the removed review ids are known.
df_02=pd.read_csv('Data/reviews_Salem_022021.csv')
df_12=pd.read_csv('Data/reviews_Salem_122021.csv')
df_re=pd.read_csv('Data/removed_id.csv')
df_reid=search_removeddata(df_02,df_12)
if len(df_reid.id)==len(df_re.id):
    if len(df_re[df_re['id'].isin(list(df_reid.id))].id)==len(df_re.id):
        print('pass')
    else:
        print('wrong')
else:
    print('wrong')
# In[12]:
# get removed-rooms list.
remove_rooms=search_removeddata(Listings_02,Listings_11)
remove_rooms
# In[13]:
# get removed-reviews list, there are 242455 removed reviews
remove_reviews=search_removeddata(Reviews_02,Reviews_11)
remove_reviews.to_csv('removedata.csv')
remove_reviews.reset_index(drop=True)
# In[14]:
Reviews_02.loc[Reviews_02['id'].isin(list(remove_reviews.id))]
# In[15]:
Reviews_11.loc[Reviews_11['id'].isin(list(remove_reviews.id))]
# In[16]:
# Find rooms that have removed reviews in the 02.2021 listings.
Listings_02.loc[Listings_02['id'].isin(list(remove_reviews.listing_id))]
# In[17]:
# Find rooms that have removed reviews in the 11.2021 listings.
Listings_11.loc[Listings_11['id'].isin(list(remove_reviews.listing_id))]
# In[18]:
# Get the dataframe of true removed reviews: removed reviews whose rooms still exist in the 11.2021 listings.
l_re=Listings_11.loc[Listings_11['id'].isin(list(remove_reviews.listing_id))].id
re_review=remove_reviews.loc[remove_reviews['listing_id'].isin(list(l_re))]
re_review.reset_index(drop=True)
# In[19]:
# Find the rooms with the most removed reviews.
m_c=Counter(re_review.listing_id.to_numpy()).most_common(10)
m_c
# In[20]:
l_id=[]
l_c=[]
for i in range(0,10):
    l_id.append(m_c[i][0])
    l_c.append(m_c[i][1])
# In[21]:
# Total number of reviews in 02.2021 for each of the top-10 rooms.
c_re=[]
for i in l_id:
    c_re.append(len(Reviews_02.loc[Reviews_02['listing_id']==i]))
c_re
# In[22]:
# Percentage of removed reviews among each room's total reviews.
p=[round(x/y,2) for x,y in zip(l_c,c_re)]
# In[23]:
unno_dict={'id':l_id,
           'reviews':c_re,
           'removed_reviews':l_c,
           'percent':p}
df_unno=pd.DataFrame(unno_dict)
df_unno.to_csv('table3_abnoroom.csv')
df_unno
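# Cross-check of the table above (a sketch added for verification, not part of the
# original analysis): count removed and total reviews per listing with value_counts
# and recompute the percentage for the same top-10 room ids.
removed_per_room = re_review['listing_id'].value_counts()
total_per_room = Reviews_02['listing_id'].value_counts()
check_unno = pd.DataFrame({'removed_reviews': removed_per_room.loc[l_id],
                           'reviews': total_per_room.loc[l_id]})
check_unno['percent'] = (check_unno['removed_reviews'] / check_unno['reviews']).round(2)
check_unno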
# In[24]:
# Top-10 reviewers with the most removed reviews.
abreviewer=Counter(re_review.reviewer_id.to_numpy()).most_common(10)
abreviewer
# In[25]:
l_r=[]
l_rc=[]
for i in range(0,10):
    l_r.append(abreviewer[i][0])
    l_rc.append(abreviewer[i][1])
# In[26]:
# Total number of reviews in 02.2021 for each of the top-10 reviewers.
c=[]
for i in l_r:
    c.append(len(Reviews_02.loc[Reviews_02['reviewer_id']==i]))
c_abre=c
# dere=[]
# for k in l_r:
# if k in l_dere:
# c_abre.append('removed')
# else:
# c_abre.append('')
#
# dere
# In[27]:
# Percentage of removed reviews among each reviewer's total reviews.
pr=[round(x/y,2) for x,y in zip(l_rc,c_abre)]
# In[28]:
len(c_abre)
# In[29]:
abno_dict={'reviewer_id':l_r,
           'reviews':c_abre,
           'removed_reviews':l_rc,
           'percent':pr}
df_abno=pd.DataFrame(abno_dict)
df_abno.to_csv('table4_unnoreviewer.csv')
df_abno
# In[30]:
# Normal (not removed) reviews from 02.2021; sample the same number as the removed reviews for comparison.
nom_review=Reviews_02.loc[~Reviews_02['id'].isin(list(remove_reviews.id))]
nom_review=nom_review.sample(n=len(re_review.id),random_state=414747)
nom_review['comments']=nom_review['comments'].astype(str)
# In[31]:
# Build a word cloud from all the texts in the given column of a dataframe.
def wordCloudFunction(df,column,numWords):
    text=""
    for v in df[column].astype(str):  # cast to str to guard against missing comments
        text=text + ' ' + v
    word_string=re.sub('([A-Za-z])]]', '\\1', text)
    wordcloud = WordCloud(#stopwords=STOPWORDS,
                          background_color='white',
                          max_words=numWords,
                          width=1000,height=1000,
                          ).generate(word_string)
    return wordcloud
# In[32]:
wordcloud1=wordCloudFunction(re_review,'comments',100000)
wordcloud2=wordCloudFunction(nom_review,'comments',100000)
# In[41]:
plt.clf()
plt.imshow(wordcloud1)
plt.axis('off')
plt.title('(a) Removed reviews')
plt.savefig('figure3_a.png')
plt.show()
# In[40]:
plt.clf()
plt.imshow(wordcloud2)
plt.axis('off')
plt.title('(b) Normal reviews')
plt.savefig('figure3_b.png')
plt.show()
# In[35]:
# Label each review as Positive/Negative/Neutral using VADER compound-score thresholds.
def sentiment_analysis(df,column):
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
    sentiment=[]
    for review in df[column]:
        sentiment_dict = sid_obj.polarity_scores(review)
        if sentiment_dict['compound'] >= 0.05:
            sentiment.append("Positive")
        elif sentiment_dict['compound'] <= -0.05:
            sentiment.append("Negative")
        else:
            sentiment.append("Neutral")
    df_senti=df.copy()
    df_senti['sentiment']=sentiment
    return df_senti
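# Toy sanity check of the VADER thresholds used above (an added illustration, not part
# of the original notebook): the three sentences should come out roughly as
# Positive, Negative and Neutral respectively.
_toy = pd.DataFrame({'comments': ['Great place, very clean and friendly host!',
                                  'Terrible stay, the room was dirty.',
                                  'The apartment is on the second floor.']})
sentiment_analysis(_toy, 'comments')['sentiment'].tolist()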
# In[36]:
re_sent=sentiment_analysis(re_review,'comments')
# In[37]:
nom_sent=sentiment_analysis(nom_review,'comments')
# In[38]:
# Stacked barplot with pandas
re_counts = re_sent['sentiment'].value_counts()    # *_counts names avoid shadowing the re module
nom_counts = nom_sent['sentiment'].value_counts()
df_plot = pd.DataFrame([re_counts, nom_counts])
df_plot.index=['Removed reviews','Normal reviews']
# Plot
df_plot.plot(kind='barh',stacked=True, title='Sentiment of reviews',color=['#4f8686',"#a0c3c3",'#d2d2d2'])
plt.savefig('figure4.png')
# In[ ]: