diff --git a/Abnormal_reviews.py b/Abnormal_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..159bd5eb231ca32df69a6064c0cb65cea178777d
--- /dev/null
+++ b/Abnormal_reviews.py
@@ -0,0 +1,418 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# Airbnb removes fake, biased, or irrelevant reviews under its review rules, and most
+# removed reviews are abnormal. By comparing two snapshots of the data we can recover
+# the removed reviews and use them to flag abnormal rooms and reviewers, ranked by the
+# number of removed reviews and by the share of removed reviews in their total reviews.
+
+# In[1]:
+
+
+import re
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from wordcloud import WordCloud, STOPWORDS
+
+
+# In[2]:
+
+
+# pip install wordcloud
+# pip install vaderSentiment
+
+
+# In[3]:
+
+
+# Import the 02.2021 and 11.2021 listing snapshots and add 'city_id'
+# (1: Los Angeles, 2: New York, 3: San Francisco). The 02.2021 listing files
+# are assumed to carry the 04022021 date stamp, matching the review files
+# loaded below; loading the 02112021 files twice would make the two snapshots
+# identical.
+
+List_LA_02=pd.read_csv('Data/listings_Los Angeles_04022021.csv')
+List_LA_02['city_id']=1
+List_NY_02=pd.read_csv('Data/listings_New York_04022021.csv')
+List_NY_02['city_id']=2
+List_SF_02=pd.read_csv('Data/listings_San Francisco_04022021.csv')
+List_SF_02['city_id']=3
+Listings_02=pd.concat([List_LA_02,List_NY_02,List_SF_02], join="inner")
+List_LA_11=pd.read_csv('Data/listings_Los Angeles_02112021.csv')
+List_LA_11['city_id']=1
+List_NY_11=pd.read_csv('Data/listings_New York_02112021.csv')
+List_NY_11['city_id']=2
+List_SF_11=pd.read_csv('Data/listings_San Francisco_02112021.csv')
+List_SF_11['city_id']=3
+Listings_11=pd.concat([List_LA_11,List_NY_11,List_SF_11], join="inner")
+
+
+# In[4]:
+
+
+print('The number of rooms in 02.2021:', len(Listings_02))
+print('The number of rooms in 11.2021:', len(Listings_11))
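+
+
+# In[ ]:
+
+
+# The per-city loading above is repetitive. A minimal sketch of the same step as a
+# loop, assuming the 'Data/<kind>_<city>_<datestamp>.csv' layout used above;
+# 'load_city_snapshot' is a hypothetical helper, not part of the original pipeline.
+def load_city_snapshot(kind, datestamp):
+    cities={'Los Angeles': 1, 'New York': 2, 'San Francisco': 3}
+    frames=[]
+    for city, city_id in cities.items():
+        df=pd.read_csv(f'Data/{kind}_{city}_{datestamp}.csv')
+        df['city_id']=city_id
+        frames.append(df)
+    return pd.concat(frames, join="inner")
+
+# e.g. Listings_02=load_city_snapshot('listings', '04022021')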
+
+
+# In[5]:
+
+
+# Import the 02.2021 and 11.2021 review files for Los Angeles, New York, and
+# San Francisco, combine the three cities, and add 'city_id'
+# (1: Los Angeles, 2: New York, 3: San Francisco).
+Reviews_LA_02=pd.read_csv('Data/reviews_Los Angeles_04022021.csv')
+Reviews_LA_02['city_id']=1
+Reviews_NY_02=pd.read_csv('Data/reviews_New York_04022021.csv')
+Reviews_NY_02['city_id']=2
+Reviews_SF_02=pd.read_csv('Data/reviews_San Francisco_04022021.csv')
+Reviews_SF_02['city_id']=3
+Reviews_LA_11=pd.read_csv('Data/reviews_Los Angeles_02112021.csv')
+Reviews_LA_11['city_id']=1
+Reviews_NY_11=pd.read_csv('Data/reviews_New York_02112021.csv')
+Reviews_NY_11['city_id']=2
+Reviews_SF_11=pd.read_csv('Data/reviews_San Francisco_02112021.csv')
+Reviews_SF_11['city_id']=3
+Reviews_02=pd.concat([Reviews_LA_02,Reviews_NY_02,Reviews_SF_02], join="inner")
+Reviews_11=pd.concat([Reviews_LA_11,Reviews_NY_11,Reviews_SF_11], join="inner")
+
+
+# In[6]:
+
+
+# Return the sorted elements of l1 that do not appear in l2. (The original
+# two-pointer scan could run past the end of l2; the set difference is both
+# simpler and safe.)
+def list_difference(l1: list, l2: list):
+    return sorted(set(l1) - set(l2))
+
+
+# In[7]:
+
+
+# Reviewers present in 02.2021 but no longer present in 11.2021.
+l02=list(set(Reviews_02.reviewer_id))
+l11=list(set(Reviews_11.reviewer_id))
+l_dere=list_difference(l02,l11)
+
+
+# In[8]:
+
+
+len(l_dere)
+
+
+# In[9]:
+
+
+print('The number of reviews in 02.2021:', len(Reviews_02))
+print('The number of reviews in 11.2021:', len(Reviews_11))
+
+
+# In[10]:
+
+
+# Find the removed reviews: rows present in the 02.2021 dataset (df1) but
+# absent from the 11.2021 dataset (df2).
+def search_removeddata(df1: pd.DataFrame, df2: pd.DataFrame):
+    l_d=list_difference(list(df1['id']), list(df2['id']))
+    df_removed_reviews=df1[df1['id'].isin(l_d)]
+    return df_removed_reviews
+
+
+# In[11]:
+
+
+# Test and verify the algorithm against a known ground truth (Salem): the
+# recovered ids must match the known removed ids exactly.
+df_02=pd.read_csv('Data/reviews_Salem_022021.csv')
+df_12=pd.read_csv('Data/reviews_Salem_122021.csv')
+df_re=pd.read_csv('Data/removed_id.csv')
+df_reid=search_removeddata(df_02,df_12)
+if set(df_reid.id)==set(df_re.id):
+    print('pass')
+else:
+    print('wrong')
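+
+
+# In[ ]:
+
+
+# A tiny self-contained check of search_removeddata that needs no data files;
+# the two frames below are made-up examples, not project data.
+_old=pd.DataFrame({'id': [1, 2, 3, 4]})
+_new=pd.DataFrame({'id': [2, 4]})
+assert list(search_removeddata(_old, _new).id)==[1, 3]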
+
+
+# In[12]:
+
+
+# Get the removed-rooms list.
+remove_rooms=search_removeddata(Listings_02,Listings_11)
+remove_rooms
+
+
+# In[13]:
+
+
+# Get the removed-reviews list; there are 242455 removed reviews.
+remove_reviews=search_removeddata(Reviews_02,Reviews_11)
+remove_reviews.to_csv('removedata.csv')
+remove_reviews.reset_index(drop=True)
+
+
+# In[14]:
+
+
+# The removed reviews as they appear in the 02.2021 data.
+Reviews_02.loc[Reviews_02['id'].isin(list(remove_reviews.id))]
+
+
+# In[15]:
+
+
+# Sanity check: the removed ids should not appear in the 11.2021 data
+# (an empty result is expected).
+Reviews_11.loc[Reviews_11['id'].isin(list(remove_reviews.id))]
+
+
+# In[16]:
+
+
+# Find rooms which have removed reviews in the 02.2021 listings.
+Listings_02.loc[Listings_02['id'].isin(list(remove_reviews.listing_id))]
+
+
+# In[17]:
+
+
+# Find rooms which have removed reviews in the 11.2021 listings.
+Listings_11.loc[Listings_11['id'].isin(list(remove_reviews.listing_id))]
+
+
+# In[18]:
+
+
+# Get the dataframe of true removed reviews: reviews whose room still exists in
+# 11.2021, so the review itself, not the whole listing, was removed.
+l_re=Listings_11.loc[Listings_11['id'].isin(list(remove_reviews.listing_id))].id
+re_review=remove_reviews.loc[remove_reviews['listing_id'].isin(list(l_re))]
+re_review.reset_index(drop=True)
+
+
+# In[19]:
+
+
+# Find the ten rooms with the most removed reviews.
+m_c=Counter(re_review.listing_id.to_numpy()).most_common(10)
+m_c
+
+
+# In[20]:
+
+
+# Split the (listing_id, count) pairs into parallel lists.
+l_id=[m[0] for m in m_c]
+l_c=[m[1] for m in m_c]
+
+
+# In[21]:
+
+
+# Total 02.2021 review counts for those ten rooms.
+c_re=[len(Reviews_02.loc[Reviews_02['listing_id']==i]) for i in l_id]
+c_re
+
+
+# In[22]:
+
+
+# Share of removed reviews in each room's total reviews.
+p=[round(x/y,2) for x,y in zip(l_c,c_re)]
+
+
+# In[23]:
+
+
+unno_dict={'id':l_id,
+           'reviews':c_re,
+           'removed_reviews':l_c,
+           'percent':p}
+df_unno=pd.DataFrame(unno_dict)
+df_unno.to_csv('table3_abnoroom.csv')
+df_unno
+
+
+# In[24]:
+
+
+# Find the ten reviewers with the most removed reviews.
+abreviewer=Counter(re_review.reviewer_id.to_numpy()).most_common(10)
+abreviewer
+
+
+# In[25]:
+
+
+# Split the (reviewer_id, count) pairs into parallel lists.
+l_r=[a[0] for a in abreviewer]
+l_rc=[a[1] for a in abreviewer]
+
+
+# In[26]:
+
+
+# Total 02.2021 review counts for those ten reviewers.
+c_abre=[len(Reviews_02.loc[Reviews_02['reviewer_id']==i]) for i in l_r]
+
+
+# In[27]:
+
+
+# Share of removed reviews in each reviewer's total reviews.
+pr=[round(x/y,2) for x,y in zip(l_rc,c_abre)]
+
+
+# In[28]:
+
+
+len(c_abre)
+
+
+# In[29]:
+
+
+abno_dict={'reviewer_id':l_r,
+           'reviews':c_abre,
+           'removed_reviews':l_rc,
+           'percent':pr}
+df_abno=pd.DataFrame(abno_dict)
+df_abno.to_csv('table4_unnoreviewer.csv')
+df_abno
+
+
+# In[30]:
+
+
+# Sample the same number of normal (not removed) reviews for comparison.
+nom_review=Reviews_02.loc[~Reviews_02['id'].isin(list(remove_reviews.id))]
+nom_review=nom_review.sample(n=len(re_review.id),random_state=414747)
+nom_review['comments']=nom_review['comments'].astype(str)
+
+
+# In[31]:
+
+
+def wordCloudFunction(df,column,numWords):
+    # Join all comments into one string (cast to str to survive missing values).
+    text=' '.join(df[column].astype(str))
+    # Strip stray ']]' artifacts left after a letter.
+    word_string=re.sub('([A-Za-z])]]', '\\1', text)
+    wordcloud=WordCloud(#stopwords=STOPWORDS,
+                        background_color='white',
+                        max_words=numWords,
+                        width=1000,height=1000,
+                        ).generate(word_string)
+    return wordcloud
+
+
+# In[32]:
+
+
+wordcloud1=wordCloudFunction(re_review,'comments',100000)
+wordcloud2=wordCloudFunction(nom_review,'comments',100000)
+
+
+# In[41]:
+
+
+plt.clf()
+plt.imshow(wordcloud1)
+plt.axis('off')
+plt.title('(a) Removed reviews')
+plt.savefig('figure3_a.png')
+plt.show()
+
+
+# In[40]:
+
+
+plt.clf()
+plt.imshow(wordcloud2)
+plt.axis('off')
+plt.title('(b) Normal reviews')
+plt.savefig('figure3_b.png')
+plt.show()
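+
+
+# In[ ]:
+
+
+# Quick illustration of the VADER 'compound' score that the sentiment function
+# below thresholds on: >= 0.05 counts as positive, <= -0.05 as negative (the
+# conventional cut-offs). The sample sentence is a made-up example.
+_sid=SentimentIntensityAnalyzer()
+print(_sid.polarity_scores('The room was clean and the host was great!'))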
+
+
+# In[35]:
+
+
+def sentiment_analysis(df,column):
+    # Label each review Positive/Negative/Neutral by its VADER compound score.
+    sid_obj=SentimentIntensityAnalyzer()
+    sentiment=[]
+    for review in df[column]:
+        # Cast to str so missing values do not break the analyzer.
+        sentiment_dict=sid_obj.polarity_scores(str(review))
+        if sentiment_dict['compound']>=0.05:
+            sentiment.append("Positive")
+        elif sentiment_dict['compound']<=-0.05:
+            sentiment.append("Negative")
+        else:
+            sentiment.append("Neutral")
+    df_senti=df.copy()
+    df_senti['sentiment']=sentiment
+    return df_senti
+
+
+# In[36]:
+
+
+re_sent=sentiment_analysis(re_review,'comments')
+
+
+# In[37]:
+
+
+nom_sent=sentiment_analysis(nom_review,'comments')
+
+
+# In[38]:
+
+
+# Stacked barplot with pandas ('re_counts' avoids shadowing the 're' module).
+re_counts=re_sent['sentiment'].value_counts()
+nom_counts=nom_sent['sentiment'].value_counts()
+df_plot=pd.DataFrame([re_counts,nom_counts])
+df_plot.index=['Removed reviews','Normal reviews']
+# Plot
+df_plot.plot(kind='barh',stacked=True, title='Sentiment of reviews',color=['#4f8686',"#a0c3c3",'#d2d2d2'])
+plt.savefig('figure4.png')
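+
+
+# In[ ]:
+
+
+# Optional follow-up: the stacked counts above can also be read as shares. A
+# minimal sketch that normalises each row of the same df_plot to percentages;
+# 'figure4_pct.png' is a hypothetical output name.
+df_pct=df_plot.div(df_plot.sum(axis=1), axis=0)*100
+df_pct.plot(kind='barh',stacked=True, title='Sentiment of reviews (%)',
+            color=['#4f8686',"#a0c3c3",'#d2d2d2'])
+plt.savefig('figure4_pct.png')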