Skip to content
Snippets Groups Projects
Commit 38794f92 authored by Beibei Wang's avatar Beibei Wang
Browse files

Upload New File

parent 4c9df5fd
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Importing the required libraries and methods
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from scipy import stats
# In[2]:
# Load the three per-city listing exports and tag every row with a numeric
# 'city_id' (1: Los Angeles, 2: New York, 3: San Francisco).
List_LA = pd.read_csv('Data/listings_Los Angeles_02112021.csv').assign(city_id=1)
List_NY = pd.read_csv('Data/listings_New York_02112021.csv').assign(city_id=2)
List_SF = pd.read_csv('Data/listings_San Francisco_02112021.csv').assign(city_id=3)
# join="inner" keeps only the columns that all three files have in common.
List_total = pd.concat((List_LA, List_NY, List_SF), join="inner")
# In[3]:
# Total number of listings across the three cities.
List_total.shape[0]
# In[4]:
# Infer each host's gender from 'host_name' using the gender-guesser name dictionary.
# Despite the "NY" in its name, this frame is built from List_total, so it
# covers every city that was concatenated above.
List_NY_name_gender = List_total[['id', 'name', 'host_name', 'review_scores_rating']]
# Listings without a review score are dropped: only rated listings are analysed.
List_NY_name_gender = List_NY_name_gender.dropna(subset=['review_scores_rating'])
import gender_guesser.detector as gender
d = gender.Detector()
# Iterate over the column directly. The previous loop rebuilt the full list of
# host names on every iteration, which made this step quadratic in the row count.
host_name_gender = [d.get_gender(name) for name in List_NY_name_gender['host_name']]
# In[19]:
# Table 1: export the first five rows of the rated-listings frame as a sample.
# NOTE(review): when this file runs top to bottom, the 'host_name_gender'
# column is only added in the In[5] cell below, so this export does not
# contain it -- confirm that is intended.
table1 = List_NY_name_gender.iloc[:5]
table1.to_csv(path_or_buf='table1_gender_name.csv')
# In[5]:
# Attach the inferred gender labels, then pull out the review scores for the
# 'female', 'male' and 'unknown' labels. Rows carrying any other label are not
# selected by any of the three series.
List_NY_name_gender['host_name_gender'] = host_name_gender
review_scores_rating_female, review_scores_rating_male, review_scores_rating_unknown = (
    List_NY_name_gender.loc[
        List_NY_name_gender['host_name_gender'] == gender_label,
        'review_scores_rating',
    ]
    for gender_label in ('female', 'male', 'unknown')
)
# In[6]:
# Group labels and the number of rated listings in each group.
x = np.array(["Female", "Male", "Unknown"])
y = np.array([
    len(scores)
    for scores in (
        review_scores_rating_female,
        review_scores_rating_male,
        review_scores_rating_unknown,
    )
])
# NOTE(review): nothing is drawn in this cell before show() is called.
plt.show()
# In[20]:
# Figure 1: horizontal bar chart of the number of rated listings per inferred
# host gender, with the count printed next to each bar.
x = np.array(["Female", "Male", "Unknown"])
y = np.array([len(review_scores_rating_female), len(review_scores_rating_male), len(review_scores_rating_unknown)])
fig, ax = plt.subplots(figsize=(15, 4))
width = 0.5  # thickness of each bar
ind = np.arange(len(y))  # one y position per group
ax.barh(ind, y, width, color="#a0c3c3")
# barh centres each bar on its position by default, so the tick (and its
# label) belongs at `ind` itself; offsetting by width/2 put the labels on the
# bar edge instead of next to the bar.
ax.set_yticks(ind)
ax.set_yticklabels(x, minor=False)
for i, v in enumerate(y):
    # Small offsets keep the count text just past the end of the bar.
    ax.text(v + 3, i + .01, str(v), color='black')
# Save through the figure handle before showing: once a shown window is
# closed the figure may be gone and a later savefig would write a blank image.
fig.savefig('figure1_gender.png')
# The original had `plt.show` without parentheses, which never displayed anything.
plt.show()
# df_m=pd.DataFrame(review_scores_rating_male)
# df_m.boxplot()
# In[21]:
# Table 2: number of listings per review-score bin, for female and male hosts,
# computed on equally sized samples of the two groups.
df_f = pd.DataFrame(review_scores_rating_female)
df_m = pd.DataFrame(review_scores_rating_male)
# samesize: randomly down-sample the larger group to the size of the smaller
# one (fixed seed for reproducibility). Sampling only ever shrinks a group, so
# this also works when there are fewer female than male rows, where
# `df_f.sample(n=len(df_m))` alone would raise.
if len(df_f) >= len(df_m):
    df_f = df_f.sample(n=len(df_m), random_state=414747)
else:
    df_m = df_m.sample(n=len(df_f), random_state=414747)
l_rating_f = list(df_f['review_scores_rating'])
l_rating_m = list(df_m['review_scores_rating'])

# Each bin is (exclusive lower bound, inclusive upper bound); the first bin has
# no lower bound, matching the original `x <= 2` test.
rating_bins = [(-np.inf, 2.0), (2.0, 4.0), (4.0, 4.5), (4.5, 4.8), (4.8, 5.0)]
# The last label previously read '4.9-5,0', which did not match the bin that
# is actually counted (4.8 < x <= 5.0).
rating_bin_labels = ['0.0-2.0', '2.0-4.0', '4.0-4.5', '4.5-4.8', '4.8-5.0']


def _count_per_bin(ratings):
    """Return how many of `ratings` fall into each entry of `rating_bins`."""
    values = np.asarray(ratings, dtype=float)
    return [
        int(np.count_nonzero((values > lower) & (values <= upper)))
        for lower, upper in rating_bins
    ]


# One row per bin, one column per gender.
data_s = np.column_stack([_count_per_bin(l_rating_f), _count_per_bin(l_rating_m)])
df_data_s = pd.DataFrame(data=data_s, columns=['female', 'male'],
                         index=rating_bin_labels)
df_data_s.to_csv('Table2_gender.csv')
df_data_s
# In[9]:
# Stacked bar chart of the per-bin counts, female and male on top of each other.
df_data_s.plot(
    kind='bar',
    stacked=True,
    alpha=0.5,
    color=['#4f8686', '#d2d2d2'],
)
# In[22]:
# Figure 2: grouped bar chart of the per-bin review-score counts, female next to male.
# The last label previously read '4.9-5,0'; the bin it names is 4.8 < x <= 5.0.
labels = ['0.0-2.0', '2.0-4.0', '4.0-4.5', '4.5-4.8', '4.8-5.0']
first = list(df_data_s['female'])
second = list(df_data_s['male'])
x = np.arange(len(labels))  # one x position per bin
width = 0.25  # width of a single bar
# Draw on a fresh figure. Plain plt.bar() draws on whatever axes are current,
# which in top-to-bottom script order are the stacked-bar axes created by the
# previous cell, so the saved image would overlay the two charts.
fig2, ax2 = plt.subplots()
# Shift the two series half a bar width either side of the bin position.
ax2.bar(x - width/2, first, width, label='female', color='#4f8686')
ax2.bar(x + width/2, second, width, label='male', color='#d2d2d2')
ax2.set_ylabel('frequency')
ax2.set_title('review_scores_rating')
ax2.set_xticks(x)
ax2.set_xticklabels(labels)
ax2.legend()
fig2.savefig('figure2_gender.png')
# In[28]:
def t_test(x, y, alternative='both-sided'):
    """Return the p-value of an unequal-variance (Welch) t-test on two samples.

    The two-sided p-value comes from ``scipy.stats.ttest_ind`` with
    ``equal_var=False``; one-sided p-values are derived from it.

    Parameters
    ----------
    x, y : array-like
        The two independent samples to compare.
    alternative : str
        ``'both-sided'`` (default; ``'two-sided'`` is accepted as an alias):
        the means differ. ``'greater'``: mean of `x` is greater than mean of
        `y`. ``'less'``: mean of `x` is less than mean of `y`.

    Returns
    -------
    float
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If `alternative` is not one of the supported values. Previously this
        surfaced as an UnboundLocalError on the return statement.
    """
    if alternative not in ('both-sided', 'two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'both-sided', 'two-sided', 'greater' or "
            "'less', got %r" % (alternative,)
        )

    _, double_p = stats.ttest_ind(x, y, equal_var=False)
    if alternative in ('both-sided', 'two-sided'):
        return double_p

    # Halve the two-sided p-value when the observed difference points in the
    # hypothesised direction; otherwise take the complementary tail.
    mean_x, mean_y = np.mean(x), np.mean(y)
    if alternative == 'greater':
        in_hypothesised_direction = mean_x > mean_y
    else:
        in_hypothesised_direction = mean_x < mean_y

    if in_hypothesised_direction:
        return double_p / 2.
    return 1.0 - double_p / 2.
# In[33]:
# p_value: one-sided test that female-host ratings are higher than male-host
# ratings, rounded to six decimals.
round(t_test(l_rating_f, l_rating_m, alternative='greater'), ndigits=6)
# In[31]:
# Is that p-value below the 0.05 threshold?
0.05 > t_test(l_rating_f, l_rating_m, alternative='greater')
# In[ ]:
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment