Beibei Wang / Airbnb Analysis · Commit 4c9df5fd
Authored 3 years ago by Beibei Wang — Upload New File (parent 8b5a61c8)
1 changed file: Contrip.py (new file, mode 100644), 344 additions, 0 deletions
#!/usr/bin/env python
# coding: utf-8

# In[1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import word_tokenize
import statsmodels.api as sm
# In[2]:
# import data from Tripadvisor
Tripadvisor_reviews = pd.read_csv('Data/tripadvisor_hotel_reviews.csv')
Tripadvisor_reviews.head(5)
# In[3]:
# Remove rows with non-English characters
Tripadvisor_reviews = Tripadvisor_reviews[Tripadvisor_reviews.Review.map(lambda x: x.isascii())]
Tripadvisor_reviews = Tripadvisor_reviews.reset_index(drop=True)
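# Quick illustration of the filter above (hypothetical strings, not rows from the data):
# str.isascii() (Python 3.7+) is False as soon as a string contains any non-ASCII character,
# e.g. "great location".isascii() -> True, "café was lovely".isascii() -> False,
# so reviews with accented or non-Latin characters are dropped entirely.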
# In[4]:
# import AFINN lexicon
Afinn = pd.read_csv('Data/AFINN-en.csv')
Afinn_dict = dict(zip(list(Afinn.Words), list(Afinn.Valence)))

# define a function sentiment_analysis
# input: a review
# output: a dataframe including words, frequency and valence
def sentiment_analysis(review: str):
    # Tokenize the review into words
    words = word_tokenize(review)
    # Keep only the tokens that match words in the AFINN dictionary
    afinn_words = set(Afinn.Words)
    clean_text = [token for token in words if token.lower() in afinn_words]
    # get frequency of each word
    frequency = dict((x, clean_text.count(x)) for x in set(clean_text))
    valence = []
    # get valence of each word
    for key in frequency.keys():
        valence.append(Afinn_dict.get(key.lower()))
    SA_dict = {'Word': list(frequency.keys()),
               'Frequency': list(frequency.values()),
               'Valence': valence}
    df_SA = pd.DataFrame(SA_dict)
    return df_SA
# In[5]:
# Output of sentiment analysis for the first review
sentiment_analysis(Tripadvisor_reviews.Review[0])
# In[6]:
# define a function review_S
# input: a review
# output: two values, the positive and negative sentiment contributions
def review_S(review: str):
    df_SA = sentiment_analysis(review)
    # frequency-weighted valence of each matched word
    FV = [a * b for a, b in zip(list(df_SA.Frequency), list(df_SA.Valence))]
    ab_sum = sum([abs(ele) for ele in FV])
    if ab_sum == 0:
        p_S = n_S = 0
    else:
        p_S = sum([x for x in FV if x >= 0]) / ab_sum
        n_S = sum([x for x in FV if x < 0]) / ab_sum
    return round(p_S, 4), round(n_S, 4)
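# Sanity check of the arithmetic above with assumed numbers (not taken from the datasets):
# word frequencies [2, 1] and valences [3, -2] give FV = [6, -2] and sum(|FV|) = 8,
# so p_S = 6/8 = 0.75 and n_S = -2/8 = -0.25; whenever ab_sum > 0, p_S - n_S equals 1.
_fv = [2 * 3, 1 * (-2)]
_ab_sum = sum(abs(v) for v in _fv)
assert round(sum(v for v in _fv if v >= 0) / _ab_sum, 4) == 0.75
assert round(sum(v for v in _fv if v < 0) / _ab_sum, 4) == -0.25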
# In[7]:
# Output of positive and negative sentiment contributions for the first review
review_S(Tripadvisor_reviews.Review[0])
# In[8]:
p_S = []
n_S = []
for re in list(Tripadvisor_reviews.Review):
    p, n = review_S(re)
    p_S.append(p)
    n_S.append(n)
Tripadvisor_reviews['S+'] = p_S
Tripadvisor_reviews['S-'] = n_S
Tripadvisor_reviews.head(10)
# In[9]:
X = Tripadvisor_reviews[['S+', 'S-']]
y = Tripadvisor_reviews['Rating']
# add a constant term to estimate the intercept
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
results.summary()
# In[10]:
# accuracy of predictions: rating = 3.3333 + 1.4419*(S+) + 2.9685*(S-)
predicts = results.predict(X)
count = 0
for i in range(len(y)):
    if abs(predicts[i] - y[i]) < 1:
        count = count + 1
accuracy = count / len(y)
accuracy
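# Reading the fitted model (coefficients as quoted in the comment above): since S+ lies in
# [0, 1] and S- lies in [-1, 0], an entirely positive review (S+ = 1, S- = 0) predicts
# 3.3333 + 1.4419 = 4.7752, while an entirely negative one (S+ = 0, S- = -1) predicts
# 3.3333 - 2.9685 = 0.3648. These two endpoints are the bounds reused below when the
# predicted Airbnb ratings are rescaled.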
# In[11]:
Reviews = pd.read_csv('Data/reviews_San Francisco_02112021.csv')
Reviews.head(5)
# In[12]:
len(Reviews)
# In[13]:
# Remove rows with non-English characters
Reviews = Reviews.dropna()
Reviews['comments'] = Reviews['comments'].astype(str)
Reviews = Reviews[Reviews.comments.map(lambda x: x.isascii())]
Reviews = Reviews.reset_index(drop=True)
Reviews.head(5)
# In[14]:
sentiment_analysis(Reviews.comments[2019])
# In[15]:
s1, s2 = review_S(Reviews.comments[2019])
s1, s2
# In[16]:
Reviews.comments[2019]
# In[17]:
# compute the sentiment contributions for every Airbnb comment
pS = []
nS = []
for r in list(Reviews.comments):
    ps, ns = review_S(r)
    pS.append(ps)
    nS.append(ns)
# In[18]:
# Compute rating by 3.3333 + 1.4419*(S+) + 2.9685*(S-), which lies in (0.3648, 4.7752)
const = [3.3333] * len(pS)
s1 = [e * 1.4419 for e in pS]
s2 = [e * 2.9685 for e in nS]
#rating = np.sum([const, s1, s2], axis=1)
rating = [i + j + k for i, j, k in zip(const, s1, s2)]
# In[19]:
# append pS, nS and rating into Reviews
Reviews['S+'] = pS
Reviews['S-'] = nS
Reviews['rating'] = rating
Reviews.head(10)
# In[20]:
# compute a consensus rating for each Airbnb room using the median of its review ratings
l_id = list(set(Reviews.listing_id))
con_rating = {}
for i in l_id:
    med = np.median(list(Reviews.loc[Reviews['listing_id'] == i]['rating']))
    con_rating[i] = (med - 0.3648) / 4.7752
con_dict = {'id': list(con_rating.keys()),
            'consensus_rating': list(con_rating.values())}
df_con = pd.DataFrame(con_dict)
df_con.head(5)
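# Worked example of the rescaling above (hypothetical median, not a value from the data):
# the per-listing median is shifted by the model's minimum (0.3648) and divided by its
# maximum (4.7752), so a neutral median of 3.3333 maps to (3.3333 - 0.3648) / 4.7752 ≈ 0.62,
# and the largest possible median of 4.7752 maps to ≈ 0.92.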
# In[21]:
# import listing data
List = pd.read_csv('Data/listings_San Francisco_02112021.csv')
List_rating = List[['id', 'review_scores_rating']]
List_rating = List_rating.dropna()
List_rating.head(10)
# In[22]:
List_rating = List_rating.loc[List_rating['id'].isin(l_id)]
List_rating.head(5)
# In[23]:
df_rating = pd.merge(List_rating, df_con, on='id', how='left')
df_rating.head(5)
# In[24]:
# compute the ConTrip score for each listing, using the project's local main module
import main
cotrip = []
for i in range(0, len(list(df_rating.review_scores_rating))):
    cot = main.compute_contrip(df_rating.review_scores_rating[i],
                               df_rating.consensus_rating[i], 0.5, 10)
    cotrip.append(cot)
df_rating['ConTrip_Score'] = cotrip
# In[25]:
scale = []
for sc in cotrip:
    scale.append(main.scaling(sc))
df_rating['Scale_Score'] = scale
# In[26]:
df_rating.head(5).to_csv('table5_contrip.csv')
# In[27]:
cons = round(df_rating.consensus_rating, 1)
rating = round(df_rating.review_scores_rating)
df_f = df_rating
df_f['consensus'] = cons
df_f['rating'] = rating
#df_f = df_f.loc[df_f['consensus'].isin([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])]
df_f1 = df_f.loc[df_f['rating'].isin([1.0, 2.0, 3.0, 4.0, 5.0])]
df_f2 = df_f.loc[df_f['consensus'].isin([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])]
# In[28]:
import seaborn as sns

fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df_f1, x="consensus", y="ConTrip_Score", hue="rating",
                palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"])
plt.savefig('figure5_a.png')
# In[29]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df_f2, x="rating", y="ConTrip_Score", hue="consensus",
                palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"])
plt.savefig('figure5_b.png')
# In[30]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df_f1, x="consensus", y="Scale_Score", hue="rating",
                palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"])
plt.savefig('figure5_c.png')
# In[31]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=df_f2, x="rating", y="Scale_Score", hue="consensus",
                palette=["#d7191c", "#fdae61", "#fee08b", "#abdda4", "#2b83ba"])
plt.savefig('figure5_d.png')

# In[ ]: