#!/usr/bin/env python
# coding: utf-8
"""Compare Airbnb review ratings of hosts with female and male first names.

The script loads the listing exports of three cities, guesses each host's
gender from ``host_name`` with the Gender-guesser dictionary, writes two
tables and two figures, and runs a one-sided Welch t-test on the ratings.

Outputs written to the working directory:

* ``table1_gender_name.csv`` - first five rated listings with guessed gender
* ``figure1_gender.png``     - number of rated listings per guessed gender
* ``Table2_gender.csv``      - rating distribution of the balanced samples
* ``figure2_gender.png``     - grouped bar chart of that distribution

Provenance: Gender_and_Rating.py, added by GitLab patch
38794f92a55973b57fcebaced8a4cc8b7519ce34 ("Upload New File"),
Beibei Wang <beibei.wang1@rwth-aachen.de>, Fri, 28 Jan 2022 08:55:25 +0100.
"""

import os

import numpy as np
import pandas as pd
from scipy import stats

# matplotlib and gender_guesser are imported where they are used, so the
# statistics helpers below stay importable without a plotting stack or the
# name dictionary installed.

# Listing exports, keyed by the ``city_id`` stamped onto their rows:
# 1: Los Angeles, 2: New York, 3: San Francisco.
CITY_LISTINGS = {
    1: 'Data/listings_Los Angeles_02112021.csv',
    2: 'Data/listings_New York_02112021.csv',
    3: 'Data/listings_San Francisco_02112021.csv',
}

# Guessed-gender labels that are analysed; any other label is ignored.
GENDER_LABELS = ('female', 'male', 'unknown')

# Upper edges of the right-closed rating bins and their display labels.
RATING_BIN_EDGES = (2.0, 4.0, 4.5, 4.8, 5.0)
RATING_BIN_LABELS = ['0.0-2.0', '2.0-4.0', '4.0-4.5', '4.5-4.8', '4.8-5.0']

# Fixed seed so that the down-sampling is reproducible.
SAMPLE_SEED = 414747


def load_listings(paths=None):
    """Read the per-city listing files and stack them into one frame.

    Args:
        paths: Mapping ``city_id -> csv path``; defaults to ``CITY_LISTINGS``.

    Returns:
        A DataFrame holding the columns common to all files (inner join)
        plus a ``city_id`` column.
    """
    paths = CITY_LISTINGS if paths is None else paths
    frames = []
    for city_id, path in paths.items():
        frame = pd.read_csv(path)
        frame['city_id'] = city_id
        frames.append(frame)
    return pd.concat(frames, join="inner")


def guess_host_genders(listings, detector=None):
    """Attach a guessed gender to every listing that has a rating.

    Args:
        listings: Frame with ``id``, ``name``, ``host_name`` and
            ``review_scores_rating`` columns.
        detector: Object exposing ``get_gender(name)``; defaults to a
            ``gender_guesser.detector.Detector``.

    Returns:
        A new frame restricted to rated listings, with an added
        ``host_name_gender`` column.
    """
    rated = listings[['id', 'name', 'host_name', 'review_scores_rating']]
    # Copy so the new column is added to an independent frame, not to a
    # view of ``listings``.
    rated = rated.dropna(subset=['review_scores_rating']).copy()
    if detector is None:
        import gender_guesser.detector as gender
        detector = gender.Detector()
    # Single pass over the column; no per-row rebuild of the name list.
    rated['host_name_gender'] = [
        detector.get_gender(name) for name in rated['host_name']
    ]
    return rated


def split_ratings_by_gender(rated):
    """Return ``{label: ratings Series}`` for each label in GENDER_LABELS.

    Rows whose guessed label is not listed there appear in no group.
    """
    return {
        label: rated.loc[rated['host_name_gender'] == label,
                         'review_scores_rating']
        for label in GENDER_LABELS
    }


def balance_samples(first, second, random_state=SAMPLE_SEED):
    """Down-sample the larger of two Series to the size of the smaller.

    Sampling is without replacement, so only the larger group can be
    reduced; the smaller one is returned untouched.

    Returns:
        The pair ``(first, second)``, both of equal length.
    """
    size = min(len(first), len(second))
    if len(first) > size:
        first = first.sample(n=size, random_state=random_state)
    if len(second) > size:
        second = second.sample(n=size, random_state=random_state)
    return first, second


def count_ratings_by_bin(values, edges=RATING_BIN_EDGES):
    """Count values per right-closed bin ``(previous edge, edge]``.

    The first bin has no lower bound. Values above the last edge and NaN
    are not counted.

    Returns:
        Integer array with one count per entry of ``edges``.
    """
    values = np.asarray(values, dtype=float)
    # side='left' yields index i with edges[i-1] < value <= edges[i];
    # too-large values and NaN land in the overflow slot len(edges).
    slots = np.searchsorted(edges, values, side='left')
    return np.bincount(slots, minlength=len(edges) + 1)[:len(edges)]


def rating_distribution(female, male):
    """Tabulate binned rating counts with ``female``/``male`` columns."""
    return pd.DataFrame(
        {
            'female': count_ratings_by_bin(female),
            'male': count_ratings_by_bin(male),
        },
        index=RATING_BIN_LABELS,
    )


def plot_gender_counts(ratings, path='figure1_gender.png'):
    """Save a horizontal bar chart of the group sizes and return the figure.

    Args:
        ratings: Mapping as returned by ``split_ratings_by_gender``.
        path: Target image file.
    """
    import matplotlib.pyplot as plt

    labels = ["Female", "Male", "Unknown"]
    counts = np.array([len(ratings[key]) for key in GENDER_LABELS])
    positions = np.arange(len(counts))  # the y locations for the groups
    width = 0.5  # the width of the bars

    fig, ax = plt.subplots(figsize=(15, 4))
    ax.barh(positions, counts, width, color="#a0c3c3")
    # Bars are centred on ``positions``, so the ticks go there as well.
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, minor=False)
    for row, count in enumerate(counts):
        ax.text(count + 3, row + .01, str(count), color='black')
    fig.savefig(path)
    return fig


def plot_rating_stack(table):
    """Draw the stacked overview of the rating distribution (not saved)."""
    return table.plot.bar(stacked=True, alpha=0.5,
                          color=['#4f8686', '#d2d2d2'])


def plot_rating_distribution(table, path='figure2_gender.png'):
    """Save a grouped bar chart of the distribution and return the figure."""
    import matplotlib.pyplot as plt

    labels = list(table.index)
    x = np.arange(len(labels))
    width = 0.25

    # Own figure/axes: never draw onto whatever chart happens to be current.
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, list(table['female']), width,
           label='female', color='#4f8686')
    ax.bar(x + width / 2, list(table['male']), width,
           label='male', color='#d2d2d2')
    ax.set_ylabel('frequency')
    ax.set_title('review_scores_rating')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    fig.savefig(path)
    return fig


def t_test(x, y, alternative='both-sided'):
    """Return the p-value of Welch's unequal-variance t-test for x and y.

    Args:
        x: First sample.
        y: Second sample.
        alternative: ``'both-sided'`` (alias ``'two-sided'``) tests for any
            difference in means, ``'greater'`` for mean(x) > mean(y) and
            ``'less'`` for mean(x) < mean(y).

    Returns:
        The p-value for the requested alternative.

    Raises:
        ValueError: If ``alternative`` is not one of the values above.
    """
    if alternative not in ('both-sided', 'two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'both-sided', 'greater' or 'less', "
            "got {!r}".format(alternative)
        )
    _, double_p = stats.ttest_ind(x, y, equal_var=False)
    if alternative in ('both-sided', 'two-sided'):
        return double_p
    if alternative == 'greater':
        in_claimed_direction = np.mean(x) > np.mean(y)
    else:
        in_claimed_direction = np.mean(x) < np.mean(y)
    # Half of the two-sided mass lies in each tail: take that half when the
    # observed difference points the claimed way, its complement otherwise.
    if in_claimed_direction:
        return double_p / 2.
    return 1.0 - double_p / 2.


def main():
    """Run the whole analysis and write all tables and figures."""
    import matplotlib.pyplot as plt

    listings = load_listings()
    print(len(listings))

    # Get gender from 'host_name' by using the Gender-guesser dictionary.
    rated = guess_host_genders(listings)
    # Exported after the gender column exists, so the table contains it.
    rated.head(5).to_csv('table1_gender_name.csv')

    ratings = split_ratings_by_gender(rated)
    plot_gender_counts(ratings)

    # Same sample size for both groups.
    female, male = balance_samples(ratings['female'], ratings['male'])
    table = rating_distribution(female, male)
    table.to_csv('Table2_gender.csv')
    print(table)

    plot_rating_stack(table)
    plot_rating_distribution(table)

    p_value = t_test(list(female), list(male), 'greater')
    print(round(p_value, 6))
    print(p_value < 0.05)

    # Shown only after every figure is on disk.
    plt.show()


if __name__ == "__main__":
    main()