In [107]:
import pandas as pd

In [108]:
import matplotlib.pyplot as plt

In [109]:
%matplotlib inline

In [18]:
df = pd.read_csv("07-hw-animals.csv")
print(df)


  animal        name  length
0    cat        Anne      35
1    cat         Bob      45
2    dog  Egglesburg      65
3    dog       Devon      50
4    cat     Charlie      32
5    dog    Fontaine      35

In [10]:
print(df.columns.values)


['animal' 'name' 'length']

In [13]:
print(df['animal'])


0    cat
1    cat
2    dog
3    dog
4    cat
5    dog
Name: animal, dtype: object

In [19]:
print(df[:3])


  animal        name  length
0    cat        Anne      35
1    cat         Bob      45
2    dog  Egglesburg      65

In [27]:
print(df)


  animal        name  length
0    cat        Anne      35
1    cat         Bob      45
2    dog  Egglesburg      65
3    dog       Devon      50
4    cat     Charlie      32
5    dog    Fontaine      35

In [32]:
print(df.sort_values(by='length', ascending=0)[:3])


  animal        name  length
2    dog  Egglesburg      65
3    dog       Devon      50
1    cat         Bob      45

In [45]:
print(df['animal'])


0    cat
1    cat
2    dog
3    dog
4    cat
5    dog
Name: animal, dtype: object

In [66]:
print(df['animal'].value_counts())


cat    3
dog    3
Name: animal, dtype: int64

In [77]:
dogs = df[df['animal'] == 'dog']
dogs


Out[77]:
animal name length
2 dog Egglesburg 65
3 dog Devon 50
5 dog Fontaine 35

In [78]:
df[df['length'] > 40]


Out[78]:
animal name length
1 cat Bob 45
2 dog Egglesburg 65
3 dog Devon 50

In [81]:
df['inches'] = df['length'] * .394
df


Out[81]:
animal name length inches
0 cat Anne 35 13.790
1 cat Bob 45 17.730
2 dog Egglesburg 65 25.610
3 dog Devon 50 19.700
4 cat Charlie 32 12.608
5 dog Fontaine 35 13.790

In [90]:
cats = df[df['animal'] == 'cat']
cats


Out[90]:
animal name length inches
0 cat Anne 35 13.790
1 cat Bob 45 17.730
4 cat Charlie 32 12.608

In [91]:
dogs = df[df['animal'] == 'dog']
dogs


Out[91]:
animal name length inches
2 dog Egglesburg 65 25.61
3 dog Devon 50 19.70
5 dog Fontaine 35 13.79

In [92]:
cats[cats['inches'] > 12]


Out[92]:
animal name length inches
0 cat Anne 35 13.790
1 cat Bob 45 17.730
4 cat Charlie 32 12.608

In [99]:
df[df['inches'] > 12]
df[df['animal'] == 'cat']


Out[99]:
animal name length inches
0 cat Anne 35 13.790
1 cat Bob 45 17.730
4 cat Charlie 32 12.608

In [103]:
cats['length'].mean()


Out[103]:
37.333333333333336

In [104]:
dogs['length'].mean()


Out[104]:
50.0

In [105]:
df.groupby('animal')['length'].mean()


Out[105]:
animal
cat    37.333333
dog    50.000000
Name: length, dtype: float64

In [110]:
dogs['length'].hist()


Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x6103910>

In [111]:
dogs.plot(kind='scatter', x='length', y='inches')


Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x6220cd0>

In [113]:
df.plot(kind='barh', x='name', y='length', legend=False)


Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x7223bd0>

In [119]:
sortcats = (cats.sort_values(by='length', ascending=0))
sortcats.plot(kind='barh', x='name', y='length', legend=False, sort_columns=False)


Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x730ead0>

In [117]:
cats


Out[117]:
animal name length inches
0 cat Anne 35 13.790
1 cat Bob 45 17.730
4 cat Charlie 32 12.608

In [5]:
import pandas as pd
df = pd.read_excel("richpeople.xlsx")

What country are most billionaires from? For the top ones, how many billionaires per billion people? Who are the top 10 richest billionaires? What's the average wealth of a billionaire? Male? Female? Who is the poorest billionaire? Who are the top 10 poorest billionaires? 'What is relationship to company'? And what are the most common relationships? Most common source of wealth? Male vs. female? Given the richest person in a country, what % of the GDP is their wealth? Add up the wealth of all of the billionaires in a given country (or a few countries) and then compare it to the GDP of the country, or other billionaires, so like pit the US vs India What are the most common industries for billionaires to come from? What's the total amount of billionaire money from each industry? How many self made billionaires vs. others? How old are billionaires? How old are billionaires self made vs. non self made? or different industries? Who are the youngest billionaires? The oldest? Age distribution - maybe make a graph about it? Maybe just made a graph about how wealthy they are in general? Maybe plot their net worth vs age (scatterplot) Make a bar graph of the top 10 or 20 richest

How many female billionaires are there compared to male? What industries are they from? What is their average wealth?


In [6]:
%matplotlib inline

In [7]:
print(df['gender'].value_counts())


male              2328
female             249
married couple       3
Name: gender, dtype: int64

In [8]:
df.groupby('gender')['networthusbillion'].mean()


Out[8]:
gender
female            3.819277
male              3.516881
married couple    1.300000
Name: networthusbillion, dtype: float64

In [10]:
df.groupby('gender')['sourceofwealth'].value_counts()


Out[10]:
gender  sourceofwealth                   
female  diversified                          9
        real estate                          7
        media                                6
        construction                         5
        consumer goods                       5
        hotels, investments                  5
        Wal-Mart                             4
        casinos                              4
        chemicals                            4
        cleaning products                    4
        Samsung                              3
        banking                              3
        commodities                          3
        mining                               3
        packaging                            3
        pipelines                            3
        retail                               3
        Campbell Soup                        2
        Cargill Inc.                         2
        bank, media                          2
        banking inheritance                  2
        coffee                               2
        financial services                   2
        hotels, restaurants                  2
        inherited, cosmetics                 2
        insurance                            2
        investments                          2
        medical equipment                    2
        paper                                2
        pharmaceuticals                      2
                                            ..
male    telecom, oil service, real estate    1
        telecom, oil, beer                   1
        telecoms                             1
        telecoms/lotteries/insurance         1
        television, Univision                1
        temp agency                          1
        textiles, apparel                    1
        timber/media                         1
        timberland, lumber mills             1
        tobacco                              1
        tobacco distribution, retail         1
        tobacco, banking                     1
        tools                                1
        tourism, construction                1
        tractors                             1
        trading company                      1
        transport                            1
        travel                               1
        vaccines                             1
        vacuums                              1
        venture capital, Google              1
        video cameras                        1
        videogames                           1
        water                                1
        water treatment systems              1
        web hosting                          1
        wind turbines                        1
        wine                                 1
        winter jackets                       1
        wrestling                            1
Name: sourceofwealth, dtype: int64

Let's make a graph 'bout it


In [12]:
df.plot(kind='scatter', x='gender', y='networthusbillion')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\indexes\base.py in get_loc(self, key, method, tolerance)
   1944             try:
-> 1945                 return self._engine.get_loc(key)
   1946             except KeyError:

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4066)()

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3930)()

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12408)()

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12359)()

KeyError: 'gender'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-12-3ef5d449adfa> in <module>()
----> 1 df.plot(kind='scatter', x='gender', y='networthusbillion')

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\tools\plotting.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3738                           fontsize=fontsize, colormap=colormap, table=table,
   3739                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3740                           sort_columns=sort_columns, **kwds)
   3741     __call__.__doc__ = plot_frame.__doc__
   3742 

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\tools\plotting.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2612                  yerr=yerr, xerr=xerr,
   2613                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2614                  **kwds)
   2615 
   2616 

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\tools\plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\tools\plotting.py in generate(self)
   1026         self._compute_plot_data()
   1027         self._setup_subplots()
-> 1028         self._make_plot()
   1029         self._add_table()
   1030         self._make_legend()

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\tools\plotting.py in _make_plot(self)
   1598         else:
   1599             label = None
-> 1600         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
   1601                              label=label, cmap=cmap, **self.kwds)
   1602         if cb:

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pandas\indexes\base.py in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4066)()

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3930)()

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12408)()

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12359)()

KeyError: 'gender'

In [ ]: