Guided project from the Dataquest (dataquest.io) Data Scientist path.
Data provided by FiveThirtyEight.
In [1]:
from pathlib import Path

# Local copy of the births data set; it is downloaded once and reused on
# later runs.
my_file = Path('US_births_1994-2003_CDC_NCHS.csv')
if my_file.is_file():
    print('File exists.')
    # The context manager closes the handle as soon as the text is read.
    with open(my_file, 'r') as file:
        data = file.read()
else:
    print("File doesn't exist, will be downloaded.")
    import urllib.request
    url = ('https://raw.githubusercontent.com/fivethirtyeight/'
           'data/master/births/US_births_1994-2003_CDC_NCHS.csv')
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    # newline='' writes the downloaded text verbatim, so whatever line
    # terminators the server sent are not translated a second time.
    with open(my_file, 'w', newline='') as file:
        file.write(data)

# splitlines() recognises '\n', '\r\n' and bare '\r', so both branches give
# the same list of rows regardless of how the file's lines are terminated.
data_lst = data.splitlines()
In [2]:
# Preview the first ten raw text lines of the data set.
print(data_lst[:10])
In [3]:
def read_csv(filename):
    """Read a CSV file of integers into a list of rows.

    The first line is treated as a header and skipped. Every remaining
    non-blank line is split on commas and each field is converted to int.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of ints per data line.

    Raises:
        ValueError: If a field on a data line is not a valid integer.
    """
    # The context manager guarantees the file handle is released.
    with open(filename, 'r') as csv_file:
        lines = csv_file.read().splitlines()

    # Blank lines (e.g. the one produced by a trailing newline) are skipped,
    # otherwise int('') would raise ValueError.
    return [
        [int(field) for field in line.split(',')]
        for line in lines[1:]
        if line.strip()
    ]
In [4]:
# Parse the local CSV into a list of integer rows (read_csv drops the header line).
cdc_list = read_csv('US_births_1994-2003_CDC_NCHS.csv')
In [5]:
# Show the first ten parsed rows.
print(cdc_list[:10])
In [6]:
def month_births(lst):
    """Total the births recorded for each month.

    Args:
        lst: Iterable of rows whose second field (index 1) is the month and
            whose last field is the number of births.

    Returns:
        A dict mapping each month value to the sum of births for that month.
    """
    births_per_month = {}
    for row in lst:
        # Index 1 is the month column; index 0 holds the year, so keying on
        # it would produce per-year totals instead.
        month = row[1]
        births_per_month[month] = births_per_month.get(month, 0) + row[-1]
    return births_per_month
In [7]:
# Aggregate births with month_births; this name is rebound later by the calc_counts cell.
cdc_month_births = month_births(cdc_list)
In [8]:
# Display the totals returned by month_births.
print(cdc_month_births)
In [9]:
def dow_births(lst):
    """Total the births recorded for each day of the week.

    Args:
        lst: Iterable of rows whose second-to-last field is the day of the
            week and whose last field is the number of births.

    Returns:
        A dict mapping each day-of-week value to its summed births.
    """
    totals = {}
    for row in lst:
        weekday, count = row[-2], row[-1]
        # A day not seen before starts from zero.
        totals[weekday] = totals.get(weekday, 0) + count
    return totals
In [10]:
# Total births keyed by the day-of-week column of each row.
cdc_day_births = dow_births(cdc_list)
In [11]:
# Display the totals returned by dow_births.
print(cdc_day_births)
In [12]:
def calc_counts(data, column):
    """Sum the last field of every row, grouped by the value in `column`.

    Args:
        data: Iterable of indexable rows; the last field of each row is the
            quantity being summed.
        column: Index of the field to group by.

    Returns:
        A dict mapping each distinct value found at `column` to its total.
    """
    feature_count = {}
    for row in data:
        key = row[column]
        amount = row[-1]
        feature_count[key] = feature_count.get(key, 0) + amount
    return feature_count
In [13]:
# Columns 0-3 hold year, month, day of month and day of week, in that order;
# build one births summary per column.
cdc_year_births, cdc_month_births, cdc_dom_births, cdc_dow_births = (
    calc_counts(cdc_list, column_index) for column_index in range(4)
)
In [14]:
# Print each summary under its heading, with a '---' line between sections.
print(
    '### Year ###', cdc_year_births, '---',
    '### Month ###', cdc_month_births, '---',
    '### Day of month ###', cdc_dom_births, '---',
    '### Day of week ###', cdc_dow_births,
    sep='\n',
)
In [15]:
# the lazy approach
def min_dict(summary):
    """Return the smallest value stored in the `summary` dict."""
    totals = summary.values()
    return min(totals)
def max_dict(summary):
    """Return the largest value stored in the `summary` dict."""
    totals = summary.values()
    return max(totals)
In [16]:
# Report the lowest and highest yearly totals, one per line.
print(
    'Minimum births per year: %i' % min_dict(cdc_year_births),
    'Maximum births per year: %i' % max_dict(cdc_year_births),
    sep='\n',
)
In [17]:
# Plotting setup: pyplot draws the charts, `style` selects a predefined theme.
import matplotlib.pyplot as plt
from matplotlib import style
# IPython magic (not plain Python): selects the inline backend so figures are
# rendered in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to the plots created after this call.
style.use('ggplot')
In [18]:
# Order the (year, births) pairs chronologically, then split them into the
# parallel tuples x (years) and y (births) used for plotting.
cdc_year_births_sorted = list(cdc_year_births.items())
cdc_year_births_sorted.sort()
x, y = zip(*cdc_year_births_sorted)
In [19]:
# Year-over-year change in births, in percent, aligned index-for-index with
# x and y. The values are later printed with a '%' suffix, so they must be
# real percentages (e.g. -1.35), not percentages scaled by another 100.
ann_x = []
for i in range(len(x)):
    if i == 0:
        # The first year has no predecessor; y[i - 1] would wrap around to
        # the last year, so store a neutral placeholder instead.
        ann_x.append(0.0)
        continue
    temp = (y[i] - y[i - 1]) / y[i - 1] * 100
    # Keep two decimals for a compact label.
    ann_x.append(round(temp, 2))
In [20]:
# Line chart of total births per year with a marker on every data point.
plt.plot(x, y, '-o')
plt.xlim([1993, 2004])
plt.ylim([3850000, 4170000])
plt.xlabel('Years')
plt.ylabel('Births')

# The first year has no previous value to compare with, so it gets a dash.
plt.annotate('-', xy=(x[0] - 0.05, y[0] + 10000))

# Every later year is labelled with its change: green when positive, red otherwise.
for i in range(1, len(x)):
    col = '#17aa0c' if ann_x[i] > 0 else '#ff0000'
    plt.annotate(
        str(ann_x[i]) + '%',
        color=col,
        xy=(x[i], y[i] + 10000),
        xytext=(x[i] - 0.3, y[i] + 20000),
    )
#plt.show()
Further proposals (source):