In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
!pip3 install xlrd
In [5]:
# Load the billionaire workbook (first sheet by default) into a DataFrame.
df = pd.read_excel(io="richpeople.xlsx")
In [14]:
# Keep only the rows whose year is 2014; the later cells all work on this slice.
is_2014 = df['year'] == 2014
recent = df[is_2014]
recent.head()
Out[14]:
In [8]:
# Show every column name available in the 2014 slice as an array.
column_index = recent.columns
column_index.values
Out[8]:
In [9]:
# value_counts() tallies how many times each country code appears.
country_codes = recent['countrycode']
country_codes.value_counts()
Out[9]:
In [10]:
# sort_values() reorders the rows based on the given column; sorting net worth
# in descending order and taking the first ten rows gives the ten largest.
(
    recent
    .sort_values('networthusbillion', ascending=False)
    .head(10)
)
Out[10]:
In [15]:
# Summary statistics for the 2014 net worth column.
# Author's reading of the output: the average wealth of a billionaire is $3.9 billion.
net_worth = recent['networthusbillion']
net_worth.describe()
Out[15]:
In [17]:
# Group the 2014 rows by gender and summarise net worth within each group.
# Author's reading of the output:
#   female mean is 3.920556 billion
#   male mean is 3.902716 billion
(
    recent
    .groupby(by='gender')['networthusbillion']
    .describe()
)
Out[17]:
In [12]:
# Order by rank with the largest rank numbers first and show ten rows.
# NOTE(review): presumably a larger rank number means a lower place on the
# list, so this shows the bottom of the ranking - confirm that is intended.
by_rank_desc = recent.sort_values('rank', ascending=False)
by_rank_desc.head(10)
Out[12]:
In [19]:
# Peek at the first few relationship-to-company values rather than echoing the
# entire column as cell output; the describe() call in the following cell
# summarises the full column.
recent['relationshiptocompany'].head(10)
Out[19]:
In [20]:
# Summarise the relationship-to-company column.
# Author's reading of the output: the most common relationship to company is founder.
recent.loc[:, 'relationshiptocompany'].describe()
Out[20]:
In [21]:
# Summarise the source-of-wealth column.
# Author's reading of the output: the most common source of wealth is real estate.
wealth_source = recent['sourceofwealth']
wealth_source.describe()
Out[21]:
In [26]:
# describe() summarises the selected column; here it runs once per gender group.
# Author's reading of the output: the most common source of wealth for men is
# real estate, while for women it is diversified.
(
    recent
    .groupby(by='gender')['sourceofwealth']
    .describe()
)
Out[26]:
In [35]:
# Take the ten rows with the largest net worth, then show their GDP column.
top_ten = recent.sort_values('networthusbillion', ascending=False).head(10)
top_ten['gdpcurrentus']
Out[35]:
In [30]:
# Inputs, both in billions of US dollars:
#   richest - net worth of the richest US billionaire ($76 billion), read off
#             the earlier sorted DataFrame output
#   usa_gdp - 2014 US GDP ($17348 billion), looked up on a website
richest, usa_gdp = 76, 17348

# Express the fortune as a percentage of GDP, rounded to two decimals.
share = richest / usa_gdp
percent = round(share * 100, 2)
print(percent, "% of the US GDP is his wealth.")
In [36]:
# Total net worth per country code, largest total first.
# Author's reading of the output: the USA totals $2322 billion, compared with
# $422 billion for Russia.
(
    recent
    .groupby(by='countrycode')['networthusbillion']
    .sum()
    .sort_values(ascending=False)
)
Out[36]:
In [37]:
# Same source-of-wealth summary as computed in an earlier cell, shown again
# right before the per-source net worth totals.
recent.loc[:, 'sourceofwealth'].describe()
Out[37]:
In [39]:
# Add up net worth for each source of wealth and list the largest totals first.
wealth_by_source = recent.groupby(by='sourceofwealth')['networthusbillion'].sum()
wealth_by_source.sort_values(ascending=False)
Out[39]:
In [ ]:
How old are billionaires? How do ages compare for self-made vs. non-self-made billionaires, or across different industries?
Who are the youngest billionaires? The oldest? What does the age distribution look like - maybe make a graph of it?
Maybe just make a graph of how wealthy they are in general?
Maybe plot their net worth vs. age (scatterplot).
Make a bar graph of the top 10 or 20 richest.
In [41]:
# Count how many 2014 rows fall under each value of the selfmade column.
recent.loc[:, 'selfmade'].value_counts()
Out[41]:
In [45]:
# Sort by age in descending order and show the first five rows.
(
    recent
    .sort_values('age', ascending=False)
    .head(5)
)
Out[45]:
In [50]:
# Indexing with a list of column names returns a DataFrame (not a Series)
# holding just those columns.
wanted = ['name', 'age', 'selfmade', 'industry']
columns_want = recent[wanted]
columns_want.head()
Out[50]:
In [ ]: