Upload New File

38794f92 · Beibei Wang · 4c9df5fd · 38794f92
Commit 38794f92 authored 3 years ago by Beibei Wang
--- a/Gender_and_Rating.py
+++ b/Gender_and_Rating.py
+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+# Importing the required libraries and methods
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import os
+from scipy import stats
+# In[2]:
+# Load the listings export of each city and tag every row with a numeric
+# 'city_id': 1 = Los Angeles, 2 = New York, 3 = San Francisco.
+List_LA=pd.read_csv('Data/listings_Los Angeles_02112021.csv').assign(city_id=1)
+List_NY=pd.read_csv('Data/listings_New York_02112021.csv').assign(city_id=2)
+List_SF=pd.read_csv('Data/listings_San Francisco_02112021.csv').assign(city_id=3)
+# join="inner" keeps only the columns that all three city frames have in common.
+List_total=pd.concat((List_LA,List_NY,List_SF), join="inner")
+# In[3]:
+# Number of listings in the combined frame (displayed as the notebook cell output).
+len(List_total)
+# In[4]:
+# Infer a gender label for every host from 'host_name' using the
+# gender-guesser dictionary. Despite the 'NY' in the variable name, the frame
+# is built from List_total, i.e. it covers all three cities.
+List_NY_name_gender=List_total[['id','name','host_name','review_scores_rating']]
+# Listings without a review score are dropped; the explicit copy makes the
+# column assignment further down operate on an independent frame rather than
+# on something pandas may still treat as a slice of List_total.
+List_NY_name_gender=List_NY_name_gender.dropna(subset=['review_scores_rating']).copy()
+import gender_guesser.detector as gender
+d = gender.Detector()
+# Iterate over the column once instead of rebuilding list(column) for every row.
+host_name_gender=[d.get_gender(host) for host in List_NY_name_gender['host_name']]
+# In[19]:
+# Save the first five rows as a sample table.
+# NOTE(review): at this point the 'host_name_gender' column has not been added
+# yet, so the CSV contains only the four selected columns - confirm that this
+# is the intended content of 'table1_gender_name.csv'.
+table1=List_NY_name_gender.head(5)
+table1.to_csv('table1_gender_name.csv')
+# In[5]:
+# Attach the labels (same row order as the frame) and split the review scores
+# by label.
+List_NY_name_gender['host_name_gender'] = host_name_gender
+review_scores_rating_female=List_NY_name_gender.loc[List_NY_name_gender['host_name_gender'] == 'female', 'review_scores_rating']
+review_scores_rating_male=List_NY_name_gender.loc[List_NY_name_gender['host_name_gender'] == 'male', 'review_scores_rating']
+review_scores_rating_unknown=List_NY_name_gender.loc[List_NY_name_gender['host_name_gender'] == 'unknown', 'review_scores_rating']
+# In[6]:
+# Group labels and the number of rated listings in each inferred-gender group.
+x = np.array(["Female","Male","Unknown"])
+y = np.array([len(review_scores_rating_female), len(review_scores_rating_male),len(review_scores_rating_unknown)])
+# In[20]:
+# Figure 1: horizontal bar chart of the group sizes.
+fig, ax = plt.subplots(figsize=(15,4))
+width = 0.5 # the width of the bars
+ind = np.arange(len(y))  # the y locations for the groups
+ax.barh(ind, y, width, color="#a0c3c3")
+# barh centres each bar on its y position, so the tick labels belong at `ind`
+# itself; offsetting them by width/2 would shift the labels off the bars.
+ax.set_yticks(ind)
+ax.set_yticklabels(x, minor=False)
+# Print the count just right of the end of each bar.
+for i, v in enumerate(y):
+    ax.text(v + 3, i + .01, str(v), color='black')
+# Save before showing: once show() returns the figure may already be closed,
+# and saving afterwards would write an empty image.
+fig.savefig('figure1_gender.png')
+plt.show()
+# df_m=pd.DataFrame(review_scores_rating_male)
+# df_m.boxplot()
+# In[21]:
+# samesize: build equally sized female/male samples, then count how many
+# ratings of each group fall into every rating bin.
+df_f=pd.DataFrame(review_scores_rating_female)
+df_m=pd.DataFrame(review_scores_rating_male)
+# Down-sample whichever group is larger. Sampling more rows than a frame holds
+# raises ValueError, so the direction must depend on the actual group sizes.
+if len(df_f) >= len(df_m):
+    df_f=df_f.sample(n=len(df_m),random_state=414747)
+else:
+    df_m=df_m.sample(n=len(df_f),random_state=414747)
+l_rating_f=list(df_f['review_scores_rating'])
+l_rating_m=list(df_m['review_scores_rating'])
+# Bins are half-open on the left: (-inf, 2], (2, 4], (4, 4.5], (4.5, 4.8], (4.8, 5.0].
+bin_upper=[2, 4, 4.5, 4.8, 5.0]
+bin_lower=[-np.inf]+bin_upper[:-1]
+arr_f=np.asarray(l_rating_f, dtype=float)
+arr_m=np.asarray(l_rating_m, dtype=float)
+# One row per bin, columns in the order (female, male).
+data_s=np.array([[int(np.sum((lo < arr) & (arr <= hi))) for arr in (arr_f, arr_m)]
+                 for lo, hi in zip(bin_lower, bin_upper)])
+# The last label now matches its bin (4.8, 5.0].
+df_data_s = pd.DataFrame(data=data_s, columns=['female','male', ],
+                   index=['0.0-2.0','2.0-4.0','4.0-4.5','4.5-4.8','4.8-5.0'])
+df_data_s.to_csv('Table2_gender.csv')
+df_data_s
+# In[9]:
+# Stacked view of the per-bin counts; pandas creates its own figure for it.
+df_data_s.plot.bar(stacked=True, alpha=0.5,color=['#4f8686','#d2d2d2'])
+# In[22]:
+# Figure 2: grouped bars, female next to male, one pair per rating bin.
+labels=['0.0-2.0','2.0-4.0','4.0-4.5','4.5-4.8','4.8-5.0']
+first=list(df_data_s['female'])
+second=list(df_data_s['male'])
+x = np.arange(len(labels))
+width = 0.25
+# Draw on a fresh figure: when run as a script the stacked chart above is still
+# the current axes, and pyplot-level bar() calls would be drawn on top of it.
+fig2, ax2 = plt.subplots()
+ax2.bar(x - width/2, first, width, label='female',color='#4f8686')
+ax2.bar(x + width/2, second, width, label='male',color='#d2d2d2')
+ax2.set_ylabel('frequency')
+ax2.set_title('review_scores_rating')
+ax2.set_xticks(x)
+ax2.set_xticklabels(labels)
+ax2.legend()
+fig2.savefig('figure2_gender.png')
+# In[28]:
+def t_test(x,y,alternative='both-sided'):
+    """Return the p-value of Welch's (unequal-variance) t-test for two samples.
+
+    Parameters:
+        x, y: the two samples, passed unchanged to ``scipy.stats.ttest_ind``.
+        alternative: ``'both-sided'`` (default; ``'two-sided'`` is accepted as
+            a synonym) for the two-sided p-value, ``'greater'`` to test
+            mean(x) > mean(y), or ``'less'`` to test mean(x) < mean(y).
+
+    Returns:
+        The p-value for the requested alternative.
+
+    Raises:
+        ValueError: if ``alternative`` is not one of the values listed above.
+    """
+    if alternative not in ('both-sided', 'two-sided', 'greater', 'less'):
+        raise ValueError(
+            "alternative must be 'both-sided', 'two-sided', 'greater' or "
+            "'less', got %r" % (alternative,))
+    t_stat, double_p = stats.ttest_ind(x,y,equal_var = False)
+    if alternative in ('both-sided', 'two-sided'):
+        return double_p
+    # The sign of the t statistic is the sign of mean(x) - mean(y). Flipping it
+    # for 'less' lets both one-sided cases share the same halving rule.
+    if alternative == 'less':
+        t_stat = -t_stat
+    # Observed difference points in the tested direction: half the two-sided
+    # p-value; otherwise the complement of that half.
+    if t_stat > 0:
+        return double_p/2.
+    return 1.0 - double_p/2.
+# In[33]:
+#p_value
+# One-sided Welch test of "female-host ratings are higher than male-host
+# ratings". The test is run once and the result printed, because a bare
+# expression is only displayed inside a notebook and is discarded in a script.
+p_value=t_test(l_rating_f, l_rating_m,'greater')
+print(round(p_value,6))
+# In[31]:
+# True when the difference is significant at the 5% level.
+print(p_value<0.05)
+# In[ ]: