Import pandas with the right name
In [2]:
import pandas as pd
Set all graphics from matplotlib to display inline
In [3]:
%matplotlib inline
Read the csv in (it should be UTF-8 already so you don't have to worry about encoding), save it with the proper boring name
In [ ]:
df = pd.read_csv('07-hw-animals.csv')
Display the names of the columns in the csv
In [5]:
df.columns
Out[5]:
Display the first 3 animals.
In [6]:
df['animal'].head(3)
Out[6]:
Sort the animals to see the 3 longest animals.
In [7]:
df.sort_values('length', ascending = False).head(3)
Out[7]:
In [ ]:
# or
df.sort_values('length').tail(3)
What are the counts of the different values of the "animal" column? a.k.a. how many cats and how many dogs.
In [8]:
df['animal'].value_counts()
Out[8]:
Only select the dogs.
In [9]:
df[df['animal'] == 'dog']
Out[9]:
Display all of the animals that are greater than 40 cm.
In [10]:
df[df['length'] > 40]
Out[10]:
'length' is the animal's length in cm. Create a new column called inches that is the length in inches.
In [11]:
inches = df['length'] * 0.393701
df['inches'] = inches
Save the cats to a separate variable called "cats." Save the dogs to a separate variable called "dogs."
In [12]:
cats = df[df['animal'] == 'cat']
dogs = df[df['animal'] == 'dog']
Display all of the animals that are cats and above 12 inches long. First do it using the "cats" variable, then do it using your normal dataframe.
In [13]:
cats[cats['inches'] > 12]
Out[13]:
In [14]:
df[(df['animal'] == 'cat') & (df['length'] > 12)]
Out[14]:
What's the mean length of a cat?
In [15]:
cats['inches'].describe()
Out[15]:
What's the mean length of a dog?
In [16]:
dogs['inches'].describe()
Out[16]:
Use groupby to accomplish both of the above tasks at once.
In [17]:
df.groupby('animal').describe()
Out[17]:
Make a histogram of the length of dogs. I apologize that it is so boring.
In [18]:
dogs['inches'].hist()
Out[18]:
Change your graphing style to be something else (anything else!)
In [19]:
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')
dogs['inches'].hist()
Out[19]:
Make a horizontal bar graph of the length of the animals, with their name as the label (look at the billionaires notebook I put on Slack!)
In [20]:
df.plot(kind = 'barh', x = 'name', y = 'inches', legend = False)
Out[20]:
Make a sorted horizontal bar graph of the cats, with the larger cats on top.
In [21]:
cats.sort_values('inches').plot(kind = 'barh', x = 'name', y = 'inches', legend=False)
Out[21]:
Answer your own selection out of the following questions, or any other questions you might be able to think of. Write the question down first in a markdown cell (use a # to make the question a nice header), THEN try to get an answer to it. A lot of these are remarkably similar, and some you'll need to do manual work for - the GDP ones, for example.
If you are trying to figure out some other question that we didn't cover in class and it does not have to do with joining to another data set, we're happy to help you figure it out during lab!
Take a peek at the billionaires notebook I uploaded into Slack; it should be helpful for the graphs (I added a few other styles and options, too). You'll probably also want to look at the "sum()" line I added.
In [22]:
df = pd.read_excel('richpeople.xlsx')
df = df[df['year'] == 2014]
In [24]:
cp = pd.read_excel('API_SP_POP_TOTL_DS2_en_excel_v2_toprowsremoved.xls')
In [28]:
pop_df = pd.merge(df, cp[['Country Code', '2014']], how = 'left', left_on = 'countrycode', right_on = 'Country Code')
In [30]:
dict_freq_countries = pop_df['citizenship'].value_counts().head(10).to_dict()
dict_freq_countries
Out[30]:
In [33]:
for x in dict_freq_countries:
country_pop = pop_df[pop_df['citizenship'] == x].head(1).to_dict()
for key in country_pop['2014'].keys():
print(x, 'has', dict_freq_countries[x] / (country_pop['2014'][key] / 1000000000), 'billionaires per billion people.')
if country_pop['2014'][key] / 1000000000 < 1:
print('Of course, this is a nonsense figure for a country with less than a billion people.')
print('')
In [34]:
df['citizenship'].value_counts().head(10)
Out[34]:
In [35]:
populations = [
{'country': 'United States', 'pop': 0.3214},
{'country': 'Germany', 'pop': 0.0809},
{'country': 'China' , 'pop': 1.3675},
{'country': 'Russia', 'pop': 0.1424},
{'country': 'Japan', 'pop': 0.1269},
{'country': 'Brazil' , 'pop': 0.2043},
{'country': 'Hong Kong' , 'pop': 0.0071},
{'country': 'France', 'pop': 0.0666},
{'country': 'United Kingdom', 'pop': 0.0641},
{'country': 'India', 'pop': 1.2517}, ]
for item in list(range(9)):
print(populations[item]['country'], 'has', df['citizenship'].value_counts()[item] / populations[item]['pop'], 'billionaires per billion people.')
if populations[item]['pop'] < 1:
print('Of course, this is a nonsense figure for a country with less than a billion people.')
print('')
#pop are in billions and based off of the CIA Factbook 2015 estimate
In [36]:
df[['name', 'rank', 'networthusbillion']].sort_values('networthusbillion', ascending = False).head(10)
Out[36]:
In [37]:
df[['gender', 'networthusbillion']].groupby('gender').describe()
Out[37]:
In [38]:
df[['name', 'rank', 'networthusbillion']].sort_values('networthusbillion').head(10)
Out[38]:
In [39]:
poorest_billionaires = df[(df['networthusbillion']) == (df['networthusbillion'].sort_values().head(1).values[0])]
print('But there are', poorest_billionaires['name'].count(), 'billionaires making just as little money:')
print('')
print(poorest_billionaires[['name', 'rank', 'networthusbillion']])
According to the PDF, relationship to company "describes the billionaire's relationship to the company primarily responsible for their wealth, such as founder, executive, relation, or shareholder"
In [40]:
df['relationshiptocompany'].value_counts()
Out[40]:
In [41]:
print('Most common source of wealth:')
df['sourceofwealth'].value_counts().head(1)
Out[41]:
In [42]:
print('The most common source of wealth for females and males:')
In [43]:
df[['gender', 'sourceofwealth']].groupby('gender').describe()
Out[43]:
In [44]:
gdp = pd.read_excel('API_NY_GDP_MKTP_CD_DS2_en_excel_v2_rowsremoved.xls')
In [46]:
gdp.columns
Out[46]:
In [47]:
gdp_df = pd.merge(df, gdp[['Country Code', '2014']], how = 'left', left_on = 'countrycode', right_on = 'Country Code')
In [48]:
gdp_df.head(1)
Out[48]:
In [49]:
gdp_df[['name', 'citizenship', 'networthusbillion', '2014']].groupby('citizenship').max() #gives the max for each country
Out[49]:
In [50]:
gdp_dict = gdp_df[['name', 'citizenship', 'networthusbillion', '2014']].groupby('citizenship').max().to_dict()
In [51]:
for country in gdp_dict['2014']:
print(country)
gdp_bill = gdp_dict['2014'][country] / 1000000000
print('gdp in billions:', gdp_bill)
print('richest billionaire:', gdp_dict['name'][country])
print('how many billions:', gdp_dict['networthusbillion'][country])
print('percent of gdp:', gdp_dict['networthusbillion'][country] / gdp_bill * 100)
print('')
In [52]:
gdp_df[['citizenship', 'networthusbillion', '2014']].groupby('citizenship').sum() #gives the sum for each country
Out[52]:
In [53]:
bill_df = gdp_df[['citizenship', 'networthusbillion', '2014']].groupby('citizenship').sum().to_dict()
In [54]:
for country in bill_df['2014']:
print(country)
gdp_bill = bill_df['2014'][country] / 1000000000
print('gdp in billions:', gdp_bill)
print('how many billions the billionaires there make:', bill_df['networthusbillion'][country])
print('percent of gdp:', bill_df['networthusbillion'][country] / gdp_bill * 100)
print('')
In [55]:
for country in bill_df['2014']:
if country == 'United States':
country1 = country
print(country)
gdp_bill1 = bill_df['2014'][country] / 1000000000
print('gdp in billions:', gdp_bill1)
billions1 = bill_df['networthusbillion'][country]
print('how many billions:', billions1)
percent1 = bill_df['networthusbillion'][country] / gdp_bill1 * 100
print('percent of gdp:', percent1)
print('')
elif country == 'India':
country2 = country
print(country)
gdp_bill2 = bill_df['2014'][country] / 1000000000
print('gdp in billions:', gdp_bill2)
billions2 = bill_df['networthusbillion'][country]
print('how many billions:', billions2)
percent2 = bill_df['networthusbillion'][country] / gdp_bill2 * 100
print('percent of gdp:', percent2)
print('')
print(country1 + "'s GDP is", gdp_bill1 / gdp_bill2, 'times that of', country2)
print(country1, 'billionaires make', billions1 / billions2, 'times the money those in', country2, 'do')
print(country1, 'billionaires share of their countrys\'s GDP is', percent1 / percent2, 'times that of those living in', country2)
In [56]:
# df.columns
Out[56]:
In [57]:
# df[['networthusbillion', 'industry', 'sector']].head()
Out[57]:
In [79]:
print('The most common industries for billionaires to come from:')
df['industry'].value_counts().head()
Out[79]:
In [80]:
print('The total amount of billionaire money in each industry:')
df[['industry', 'networthusbillion']].groupby('industry').sum().sort_values('networthusbillion', ascending = False)
Out[80]:
In [82]:
df['selfmade'].value_counts()
Out[82]:
In [90]:
df['age'].hist()
Out[90]:
In [84]:
df['age'].describe()
Out[84]:
In [87]:
df[df['selfmade'] == 'self-made']['age'].hist()
Out[87]:
In [91]:
df[df['selfmade'] == 'self-made']['age'].describe()
Out[91]:
In [92]:
df[df['selfmade'] == 'inherited']['age'].hist()
Out[92]:
In [93]:
df[df['selfmade'] == 'inherited']['age'].describe()
Out[93]:
In [96]:
df[['age', 'industry']].groupby('industry').mean().sort_values('age')
Out[96]:
In [97]:
df[['name', 'age']].sort_values('age').head()
Out[97]:
In [98]:
df[['name', 'age']].sort_values('age', ascending=False).head()
Out[98]:
In [99]:
df['age'].hist()
Out[99]:
In [100]:
df['networthusbillion'].hist()
Out[100]:
In [107]:
# df['networthusbillion'].sort_values(ascending = False).head(10)
Out[107]:
In [111]:
df[['networthusbillion', 'age']].plot(kind = 'scatter', x = 'networthusbillion', y = 'age')
Out[111]:
In [119]:
df[['name', 'networthusbillion']].sort_values('networthusbillion', ascending = False).head(10).plot(kind = 'bar', x = 'name', y = 'networthusbillion')
Out[119]:
In [ ]:
df = pd.read_json('https://data.sfgov.org/api/views/gxxq-x39z/rows.json')
# Can't get this to work! And I can't save the source code for some reason.
In [ ]:
df.head()