notebook.community

Edit and run



In [96]:

    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



In [2]:

    
countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

life_expectancy_values = [74.7,  75. ,  83.4,  57.6,  74.6,  75.4,  72.3,  81.5,  80.2,
                          70.3,  72.1,  76.4,  68.1,  75.2,  69.8,  79.4,  70.8,  62.7,
                          67.3,  70.6]

gdp_values = [ 1681.61390973,   2155.48523109,  21495.80508273,    562.98768478,
              13495.1274663 ,   9388.68852258,   1424.19056199,  24765.54890176,
              27036.48733192,   1945.63754911,  21721.61840978,  13373.21993972,
                483.97086804,   9783.98417323,   2253.46411147,  25034.66692293,
               3680.91642923,    366.04496652,   1175.92638695,   1132.21387981]



In [3]:

    
# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)



In [33]:

    
def variable_correlation(variable1, variable2):
    
    mean1 = variable1.mean()
    mean2 = variable2.mean()
    
    num_same_direction = (len(variable1[variable1 > mean1][variable2 > mean2]) + len(variable1[variable1 < mean1][variable2 < mean2]))        # Replace this with your code
    num_different_direction = len(variable1) - (len(variable1[variable1 > mean1][variable2 > mean2]) + len(variable1[variable1 < mean1][variable2 < mean2]))  # Replace this with your code
    
    return (num_same_direction, num_different_direction)



In [34]:

    
variable_correlation(pd.Series([1,2,3,4]), pd.Series([2,3,1,5]))









    Out[34]:





(2, 2)



In [37]:

    
variable_correlation(pd.Series(life_expectancy_values), pd.Series(gdp_values))









    Out[37]:





(17, 3)



In [40]:

    
le = pd.Series(life_expectancy_values)
le.describe()









    Out[40]:





count    20.000000
mean     72.870000
std       6.213999
min      57.600000
25%      70.175000
50%      73.450000
75%      75.650000
max      83.400000
dtype: float64



In [41]:

    
countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
]



In [42]:

    
employment_values = [
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
]



In [43]:

    
employment = pd.Series(employment_values, index=countries)



In [46]:

    
def max_employment(employment):
    '''
    Fill in this function to return the name of the country
    with the highest employment in the given employment
    data, and the employment in that country.
    
    The input will be a Pandas series where the values
    are employment and the index is country names.
    
    Try using the Pandas argmax() function. Documention is
    here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.argmax.html
    '''
    max_country = employment.argmax()      # Replace this with your code
    max_value = employment[max_country]   # Replace this with your code

    return (max_country, max_value)
print max_employment(employment)









    



('Angola', 75.699996949999999)



In [47]:

    
# Some more Panda practice



In [48]:

    
# Addition when indexes are the same
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print s1 + s2

# Indexes have same elements in a different order
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print s1 + s2

# Indexes overlap, but do not have exactly the same elements
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print s1 + s2

# Indexes do not overlap
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print s1 + s2









    



a    11
b    22
c    33
d    44
dtype: int64
a    31
b    12
c    43
d    24
dtype: int64
a     NaN
b     NaN
c    13.0
d    24.0
e     NaN
f     NaN
dtype: float64
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64



In [49]:

    
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])



In [51]:

    
(s1 + s2).fillna(0)









    Out[51]:





a     0.0
b     0.0
c    13.0
d    24.0
e     0.0
f     0.0
dtype: float64



In [52]:

    
s1.add(s2, fill_value=0)









    Out[52]:





a     1.0
b     2.0
c    13.0
d    24.0
e    30.0
f    40.0
dtype: float64



In [55]:

    
s = pd.Series([1, 2, 3, 4, 5])

def add_one(x):
        return x + 1

print s.apply(add_one)









    



0    2
1    3
2    4
3    5
4    6
dtype: int64



In [82]:

    
names1 = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane'
])



In [83]:

    
names1=pd.Series(['Andre Agassi', 'Barry Bonds', 'Christopher Columbus', 'Daniel Defoe'],
                index=[0, 1, 2, 3])



In [86]:

    
def reverse_name(name):
    return (name.split())[1] + ' ' + (name.split())[0]
print reverse_name("Aavni Garg")









    



Garg Aavni



In [87]:

    
def reverse_names(names):
    '''
    Fill in this function to return a new series where each name
    in the input series has been transformed from the format
    "Firstname Lastname" to "Lastname, FirstName".
    
    Try to use the Pandas apply() function rather than a loop.
    '''
    return names.apply(reverse_name)



In [88]:

    
print reverse_names(names1)









    



0            Agassi Andre
1             Bonds Barry
2    Columbus Christopher
3            Defoe Daniel
dtype: object



In [95]:

    
data = pd.Series([3,4,5,6,7,8])
data.plot()









    Out[95]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa3e506790>



In [100]:

    
# The following code reads all the Gapminder data into Pandas DataFrames. You'll
# learn about DataFrames next lesson.

path = ''
employment = pd.read_csv(path + 'employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv(path + 'female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv(path + 'male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv(path + 'life_expectancy.csv', index_col='Country')
gdp = pd.read_csv(path + 'gdp_per_capita.csv', index_col='Country')



In [ ]:

    
# The following code creates a Pandas Series for each variable for the United States.
# You can change the string 'United States' to a country of your choice.

employment_us = employment.loc['United States']
female_completion_us = female_completion.loc['United States']
male_completion_us = male_completion.loc['United States']
life_expectancy_us = life_expectancy.loc['United States']
gdp_us = gdp.loc['United States']



In [103]:

    
employment_india = employment.loc['India']
female_completion_india = female_completion.loc['India']
male_completion_india = male_completion.loc['India']
life_expectancy_india = life_expectancy.loc['India']
gdp_india = gdp.loc['India']



In [104]:

    
# Uncomment the following line of code to see the available country names
print employment.index.values









    



['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde'
 'Central African Rep.' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo, Rep.' 'Congo, Dem. Rep.' 'Costa Rica' "Cote d'Ivoire" 'Croatia'
 'Cuba' 'Cyprus' 'Czech Rep.' 'Denmark' 'Dominican Rep.' 'Timor-Leste'
 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia'
 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia' 'Georgia' 'Germany'
 'Ghana' 'Greece' 'Guadeloupe' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Hong Kong, China' 'Hungary' 'Iceland' 'India'
 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan'
 'Jordan' 'Kazakhstan' 'Kenya' 'Korea, Dem. Rep.' 'Korea, Rep.' 'Kuwait'
 'Kyrgyzstan' 'Laos' 'Latvia' 'Lebanon' 'Lesotho' 'Liberia' 'Libya'
 'Lithuania' 'Luxembourg' 'Macao, China' 'Madagascar' 'Malawi' 'Malaysia'
 'Maldives' 'Mali' 'Malta' 'Martinique' 'Mauritania' 'Mauritius' 'Mexico'
 'Mongolia' 'Morocco' 'Mozambique' 'Myanmar' 'Namibia' 'Nepal'
 'Netherlands' 'Netherlands Antilles' 'New Zealand' 'Nicaragua' 'Niger'
 'Nigeria' 'Norway' 'Oman' 'Pakistan' 'Panama' 'Papua New Guinea'
 'Paraguay' 'Peru' 'Philippines' 'Poland' 'Portugal' 'Puerto Rico' 'Qatar'
 'Moldova' 'Reunion' 'Romania' 'Russia' 'Rwanda' 'Saudi Arabia' 'Senegal'
 'Serbia and Montenegro' 'Sierra Leone' 'Singapore' 'Slovak Republic'
 'Slovenia' 'Solomon Islands' 'Somalia' 'South Africa' 'Spain' 'Sri Lanka'
 'Sudan' 'Suriname' 'Swaziland' 'Sweden' 'Switzerland' 'Syria' 'Taiwan'
 'Tajikistan' 'Tanzania' 'Thailand' 'Macedonia, FYR' 'Togo'
 'Trinidad and Tobago' 'Tunisia' 'Turkey' 'Turkmenistan' 'Uganda' 'Ukraine'
 'United Arab Emirates' 'United Kingdom' 'United States' 'Uruguay'
 'Uzbekistan' 'Venezuela' 'Vietnam' 'West Bank and Gaza' 'Yemen, Rep.'
 'Zambia' 'Zimbabwe']



In [105]:

    
# Use the Series defined above to create a plot of each variable over time for
# the country of your choice. You will only be able to display one plot at a time
# with each "Test Run".
employment_us.plot()









    Out[105]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa409ad910>



In [106]:

    
employment_india.plot()









    Out[106]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa36124210>



In [107]:

    
female_completion_us.plot()









    Out[107]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa360b1590>



In [108]:

    
female_completion_india.plot()









    Out[108]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa36007a90>



In [109]:

    
male_completion_us.plot()









    Out[109]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa35edd990>



In [111]:

    
male_completion_india.plot()









    Out[111]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa35d75ad0>



In [112]:

    
life_expectancy_us.plot()









    Out[112]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa35cc6750>



In [113]:

    
life_expectancy_india.plot()









    Out[113]:





<matplotlib.axes._subplots.AxesSubplot at 0x7faa35c6e5d0>



In [ ]: