Beibei Wang / Airbnb Analysis / Commits / 8b5a61c8

Commit 8b5a61c8 · Upload New File
Authored 3 years ago by Beibei Wang · parent 720aacb5
No related branches, tags, or merge requests found.

Showing 1 changed file: Abnormal_reviews.py (new file, mode 100644), 418 additions and 0 deletions.
#!/usr/bin/env python
# coding: utf-8

# Fake, biased, or irrelevant reviews are removed under Airbnb's review rules, and most removed
# reviews are abnormal. We can therefore identify abnormal rooms and reviewers from the removed
# reviews. Abnormal rooms/reviewers are characterised by the number of removed reviews and by the
# share of removed reviews among their total reviews.
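# Illustrative example of the metric (hypothetical numbers, not taken from the data): a listing
# with 40 reviews in the 02.2021 snapshot, 10 of which no longer appear in the 11.2021 snapshot,
# has 10 removed reviews and a removed-review share of 10 / 40 = 0.25.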
# In[1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import words
import re
from nltk import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# In[2]:

# pip install wordcloud
# pip install vaderSentiment
# In[3]:

# Import the listing datasets and add 'city_id' (1: Los Angeles, 2: New York, 3: San Francisco).
List_LA_02 = pd.read_csv('Data/listings_Los Angeles_02112021.csv')
List_LA_02['city_id'] = 1
List_NY_02 = pd.read_csv('Data/listings_New York_02112021.csv')
List_NY_02['city_id'] = 2
List_SF_02 = pd.read_csv('Data/listings_San Francisco_02112021.csv')
List_SF_02['city_id'] = 3
Listings_02 = pd.concat([List_LA_02, List_NY_02, List_SF_02], join="inner")

List_LA_11 = pd.read_csv('Data/listings_Los Angeles_02112021.csv')
List_LA_11['city_id'] = 1
List_NY_11 = pd.read_csv('Data/listings_New York_02112021.csv')
List_NY_11['city_id'] = 2
List_SF_11 = pd.read_csv('Data/listings_San Francisco_02112021.csv')
List_SF_11['city_id'] = 3
Listings_11 = pd.concat([List_LA_11, List_NY_11, List_SF_11], join="inner")
# In[4]:

print('The number of rooms in 02.2021:', len(Listings_02))
print('The number of rooms in 11.2021:', len(Listings_11))
# In[5]:

# Import the review lists of Los Angeles, New York, and San Francisco for 02.2021 and 11.2021,
# and combine the three cities' lists.
# Add 'city_id' (1: Los Angeles, 2: New York, 3: San Francisco).
Reviews_LA_02 = pd.read_csv('Data/reviews_Los Angeles_04022021.csv')
Reviews_LA_02['city_id'] = 1
Reviews_NY_02 = pd.read_csv('Data/reviews_New York_04022021.csv')
Reviews_NY_02['city_id'] = 2
Reviews_SF_02 = pd.read_csv('Data/reviews_San Francisco_04022021.csv')
Reviews_SF_02['city_id'] = 3

Reviews_LA_11 = pd.read_csv('Data/reviews_Los Angeles_02112021.csv')
Reviews_LA_11['city_id'] = 1
Reviews_NY_11 = pd.read_csv('Data/reviews_New York_02112021.csv')
Reviews_NY_11['city_id'] = 2
Reviews_SF_11 = pd.read_csv('Data/reviews_San Francisco_02112021.csv')
Reviews_SF_11['city_id'] = 3

Reviews_02 = pd.concat([Reviews_LA_02, Reviews_NY_02, Reviews_SF_02], join="inner")
Reviews_11 = pd.concat([Reviews_LA_11, Reviews_NY_11, Reviews_SF_11], join="inner")
# In[6]:

def list_difference(l1: list, l2: list):
    """Return the elements of l1 that do not appear in l2.

    Both lists are sorted in place and walked with two pointers.
    """
    l_d = []
    l1.sort()
    l2.sort()
    j = 0
    if len(l1) > len(l2):
        return "wrong length"
    for i in range(0, len(l1)):
        if j >= len(l2):
            # l2 is exhausted: every remaining element of l1 is missing from l2.
            l_d.append(l1[i])
        elif l1[i] == l2[j]:
            j = j + 1
        elif l2[j] < l1[i]:
            # Advance the l2 pointer until it catches up with l1[i].
            while j < len(l2) and l2[j] < l1[i]:
                j = j + 1
            if j >= len(l2) or l2[j] > l1[i]:
                l_d.append(l1[i])
        else:
            # l2[j] > l1[i], so l1[i] cannot be in l2.
            l_d.append(l1[i])
    return l_d
# In[7]:

l02 = list(set(list(Reviews_02.reviewer_id)))
l11 = list(set(list(Reviews_11.reviewer_id)))
l_dere = list_difference(l02, l11)

# In[8]:

len(l_dere)
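# Optional sanity check (illustrative, not part of the original workflow): because l02 and l11
# are de-duplicated via set(), a plain set difference should agree with list_difference here.
# assert sorted(set(l02) - set(l11)) == sorted(l_dere)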
# In[9]:

print('The number of reviews in 02.2021:', len(Reviews_02))
print('The number of reviews in 11.2021:', len(Reviews_11))
# In[10]:

# Find the removed records: rows whose 'id' appears in the 02.2021 dataset but not in the 11.2021 dataset.
def search_removeddata(df1: pd.DataFrame, df2: pd.DataFrame):
    l1 = list(df1['id'])
    l2 = list(df2['id'])
    l_d = list_difference(l1, l2)
    df_removed_reviews = df1[df1['id'].isin(l_d)]
    return df_removed_reviews
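# Minimal illustration on toy frames (hypothetical ids, not project data):
# search_removeddata(pd.DataFrame({'id': [1, 2, 3]}), pd.DataFrame({'id': [1, 3, 4]}))
# returns the row with id == 2, i.e. the record present only in the first snapshot.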
# In[11]:

# Test and verify the algorithm on a small example (Salem review snapshots with a reference list of removed ids).
df_02 = pd.read_csv('Data/reviews_Salem_022021.csv')
df_12 = pd.read_csv('Data/reviews_Salem_122021.csv')
df_re = pd.read_csv('Data/removed_id.csv')
df_reid = search_removeddata(df_02, df_12)
if len(df_reid.id) == len(df_re.id):
    if len(df_re[df_re['id'].isin(list(df_reid.id))].id) == len(df_re.id):
        print('pass')
    else:
        print('wrong')
else:
    print('wrong')
# In[12]:

# get removed-rooms list
remove_rooms = search_removeddata(Listings_02, Listings_11)
remove_rooms

# In[13]:

# get removed-reviews list; there are 242455 removed reviews
remove_reviews = search_removeddata(Reviews_02, Reviews_11)
remove_reviews.to_csv('removedata.csv')
remove_reviews.reset_index(drop=True)
# In[14]:

Reviews_02.loc[Reviews_02['id'].isin(list(remove_reviews.id))]

# In[15]:

Reviews_11.loc[Reviews_11['id'].isin(list(remove_reviews.id))]
# In[16]:

# Find rooms which have removed reviews in the 02.2021 listings.
Listings_02.loc[Listings_02['id'].isin(list(remove_reviews.listing_id))]

# In[17]:

# Find rooms which have removed reviews in the 11.2021 listings.
Listings_11.loc[Listings_11['id'].isin(list(remove_reviews.listing_id))]
# In[18]:
#get the dataframe of ture removed reviews
l_re
=
Listings_11
.
loc
[
Listings_11
[
'
id
'
].
isin
(
list
(
remove_reviews
.
listing_id
))].
id
re_review
=
remove_reviews
.
loc
[
remove_reviews
[
'
listing_id
'
].
isin
(
list
(
l_re
))]
re_review
.
reset_index
(
drop
=
True
)
# In[19]:

# Find the rooms with the most removed reviews.
m_c = Counter(re_review.listing_id.to_numpy()).most_common(10)
m_c
# In[20]:
l_id
=
[]
l_c
=
[]
for
i
in
range
(
0
,
10
):
l_id
.
append
(
m_c
[
i
][
0
])
l_c
.
append
(
m_c
[
i
][
1
])
# In[21]:

# For each of the top-10 rooms, count its total reviews in the 02.2021 snapshot.
c_re = []
remove_reviews.loc[remove_reviews['listing_id'].isin(list(l_re))]
for i in l_id:
    c_re.append(len(Reviews_02.loc[Reviews_02['listing_id'] == i]))
c_re
# In[22]:
p
=
[
round
(
x
/
y
,
2
)
for
x
,
y
in
zip
(
l_c
,
c_re
)]
# In[23]:
unno_dict
=
{
'
id
'
:
l_id
,
'
reviews
'
:
c_re
,
'
removed_reviews
'
:
l_c
,
'
percent
'
:
p
}
df_unno
=
pd
.
DataFrame
(
unno_dict
)
df_unno
.
to_csv
(
'
table3_abnoroom.csv
'
)
df_unno
# In[24]:
abreviewer
=
Counter
(
re_review
.
reviewer_id
.
to_numpy
()).
most_common
(
10
)
abreviewer
# In[25]:
l_r
=
[]
l_rc
=
[]
for
i
in
range
(
0
,
10
):
l_r
.
append
(
abreviewer
[
i
][
0
])
l_rc
.
append
(
abreviewer
[
i
][
1
])
# In[26]:
c
=
[]
for
i
in
l_r
:
c
.
append
(
len
(
Reviews_02
.
loc
[
Reviews_02
[
'
reviewer_id
'
]
==
i
]))
c_abre
=
c
# dere=[]
# for k in l_r:
# if k in l_dere:
# c_abre.append('removed')
# else:
# c_abre.append('')
#
# dere
# In[27]:
pr
=
[
round
(
x
/
y
,
2
)
for
x
,
y
in
zip
(
l_rc
,
c_abre
)]
# In[28]:
len
(
c_abre
)
# In[29]:
abno_dict
=
{
'
reviewer_id
'
:
l_r
,
'
reviews
'
:
c_abre
,
'
removed_reviews
'
:
l_rc
,
'
percent
'
:
pr
}
df_abno
=
pd
.
DataFrame
(
abno_dict
)
df_abno
.
to_csv
(
'
table4_unnoreviewer.csv
'
)
df_abno
# In[30]:
nom_review
=
Reviews_02
.
loc
[
~
Reviews_02
[
'
id
'
].
isin
(
list
(
remove_reviews
.
id
))]
nom_review
=
nom_review
.
sample
(
n
=
len
(
re_review
.
id
),
random_state
=
414747
)
nom_review
[
'
comments
'
]
=
nom_review
[
'
comments
'
].
astype
(
str
)
# In[31]:
def
wordCloudFunction
(
df
,
column
,
numWords
):
text
=
""
for
v
in
df
[
column
]:
text
=
text
+
'
'
+
v
word_string
=
re
.
sub
(
'
([A-Za-z])]]
'
,
'
\\
1
'
,
text
)
wordcloud
=
WordCloud
(
#stopwords=STOPWORDS,
background_color
=
'
white
'
,
max_words
=
numWords
,
width
=
1000
,
height
=
1000
,
).
generate
(
word_string
)
return
wordcloud
# In[32]:
wordcloud1
=
wordCloudFunction
(
re_review
,
'
comments
'
,
100000
)
wordcloud2
=
wordCloudFunction
(
nom_review
,
'
comments
'
,
100000
)
# In[41]:
plt
.
clf
()
plt
.
imshow
(
wordcloud1
)
plt
.
axis
(
'
off
'
)
plt
.
title
(
'
(a) Removed reviews
'
)
plt
.
savefig
(
'
figure3_a.png
'
)
plt
.
show
()
# In[40]:
plt
.
clf
()
plt
.
imshow
(
wordcloud2
)
plt
.
axis
(
'
off
'
)
plt
.
title
(
'
(b) Normal reviews
'
)
plt
.
savefig
(
'
figure3_b.png
'
)
plt
.
show
()
# In[35]:
def
sentiment_analysis
(
df
,
column
):
# Create a SentimentIntensityAnalyzer object.
sid_obj
=
SentimentIntensityAnalyzer
()
sentiment
=
[]
for
review
in
df
[
column
]:
sentiment_dict
=
sid_obj
.
polarity_scores
(
review
)
if
sentiment_dict
[
'
compound
'
]
>=
0.05
:
sentiment
.
append
(
"
Positive
"
)
elif
sentiment_dict
[
'
compound
'
]
<=
-
0.05
:
sentiment
.
append
(
"
Negative
"
)
else
:
sentiment
.
append
(
"
Neutral
"
)
df_senti
=
df
.
copy
()
df_senti
[
'
sentiment
'
]
=
sentiment
return
df_senti
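# Quick illustrative check on hypothetical strings (not from the dataset):
# sentiment_analysis(pd.DataFrame({'comments': ['Great place, spotless and quiet!',
#                                               'Dirty room and a rude host.']}), 'comments')
# should label the first comment Positive and the second Negative.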
# In[36]:
re_sent
=
sentiment_analysis
(
re_review
,
'
comments
'
)
# In[37]:
nom_sent
=
sentiment_analysis
(
nom_review
,
'
comments
'
)
# In[38]:

# Stacked barplot with pandas
re = re_sent['sentiment'].value_counts()
nom = nom_sent['sentiment'].value_counts()
df_plot = pd.DataFrame([re, nom])
df_plot.index = ['Removed reviews', 'Normal reviews']

# Plot
df_plot.plot(kind='barh', stacked=True, title='Sentiment of reviews',
             color=['#4f8686', "#a0c3c3", '#d2d2d2'])
plt.savefig('figure4.png')

# In[ ]: