In [8]:
import difflib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load one CSV per year and stack them into a single DataFrame called `names`.

# Directory that holds the yobYYYY.txt files. Kept in one place so that the
# location can be changed without touching the loading loop.
NAMES_DIR = 'C:/Users/User/Desktop/python/pandas/Pandas_Python3/names'

years = range(1880, 2017)  # 1880..2016 inclusive
# Passing names= to read_csv supplies the column labels, so pandas treats the
# first line of every file as data rather than as a header.
columns = ['name', 'sex', 'births']

pieces = []
for year in years:
    path = '%s/yob%d.txt' % (NAMES_DIR, year)
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year  # tag every row with the year of the file it came from
    pieces.append(frame)

# ignore_index=True replaces the per-file row indexes with one continuous index.
names = pd.concat(pieces, ignore_index=True)
In [11]:
# Distinct values of the 'name' column across all loaded years.
names_uniq = names['name'].unique()
In [12]:
# Number of distinct names in the combined data.
len(names['name'].unique())
Out[12]:
In [24]:
# Collect the names that are spelled similarly to 'Mary'.
# get_close_matches returns at most n=100 candidates whose similarity ratio to
# 'Mary' is at least cutoff=0.8, ordered from most to least similar.
# Requires `import difflib` in the notebook's import cell.
group1 = difflib.get_close_matches('Mary', names_uniq, n=100, cutoff=0.8)
In [25]:
# Make sure 'Mary' itself is part of the group, without duplicating it:
# get_close_matches already returns an exact match when one exists among the
# candidates, and the guard also keeps this cell idempotent when it is re-run.
if 'Mary' not in group1:
    group1.append('Mary')
print(len(group1))
group1
Out[25]:
In [26]:
# Percentage of births, per year, that were registered as female among the
# names in group1. f_total ends up as a list with one float per entry of
# `years` (NaN for a year in which none of the group's names occur).

# Filter to the group once instead of once per year, and derive the female
# subset from that already-filtered frame so that the boolean mask shares its
# index (avoids pandas' "Boolean Series key will be reindexed" warning).
group_names = names[names['name'].isin(group1)]
f_names = group_names[group_names['sex'] == 'F']

# Births summed per year for both sexes and for girls only. Reindexing on
# `years` keeps the result aligned with the x-axis used for plotting; years
# without any rows count as 0 births.
year_index = list(years)
births_all = group_names.groupby('year')['births'].sum().reindex(year_index, fill_value=0)
births_f = f_names.groupby('year')['births'].sum().reindex(year_index, fill_value=0)

f_total = (births_f / births_all * 100).tolist()
In [27]:
# Line chart of the yearly female share (in percent) for the selected names.
fig, ax = plt.subplots()
ax.plot(years, f_total, color='red')
ax.set_title('Доли популярных имен в разрезе полов')
ax.set_xlabel('Года')
ax.set_ylabel('Доля в %')
ax.legend(['Ж'])
plt.show()
In [ ]:
# Show the computed percentages (one value per year in `years`).
f_total
In [ ]: