In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

years=range(1880, 2017)

pieces = []
columns = [ 'name', 'sex', 'births' ]
for year in years:
    path = 'C:/Users/User/Desktop/python/pandas/Pandas_Python3/names/yob%d.txt' %year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

names = pd.concat(pieces, ignore_index = True)

In [11]:
names_uniq = names.name.unique()

In [12]:
len(names.name.unique())


Out[12]:
96174

In [24]:
group1 = difflib.get_close_matches('Mary', names_uniq, n=100, cutoff=0.8)

In [25]:
group1.append('Mary')
print (len(group1))
group1


81
Out[25]:
['Mary',
 'Mmary',
 'Maury',
 'Maryn',
 'Maryl',
 'Marye',
 'Marya',
 'Marty',
 'Marry',
 'Marny',
 'Marly',
 'Marky',
 'Margy',
 'Marey',
 'Mardy',
 'Marcy',
 'Maray',
 'Mabry',
 'Mry',
 'May',
 'Mar',
 'Merary',
 'Mavryk',
 'Mavery',
 'Mauryn',
 'Maurya',
 'Maurey',
 'Maudry',
 'Maryum',
 'Maryse',
 'Marysa',
 'Maryon',
 'Maryna',
 'Marylu',
 'Maryln',
 'Maryla',
 'Maryke',
 'Maryka',
 'Maryjo',
 'Maryia',
 'Maryha',
 'Maryem',
 'Maryel',
 'Maryan',
 'Maryam',
 'Maryah',
 'Marvyn',
 'Marvyl',
 'Martyn',
 'Marthy',
 'Martay',
 'Marryn',
 'Marney',
 'Marnay',
 'Marlys',
 'Marlyn',
 'Marlye',
 'Marley',
 'Markya',
 'Markey',
 'Markay',
 'Mariya',
 'Marivy',
 'Marily',
 'Mariby',
 'Margey',
 'Mareya',
 'Marely',
 'Marcys',
 'Marcey',
 'Maraya',
 'Malory',
 'Malery',
 'Malary',
 'Mairyn',
 'Maevry',
 'Maeryn',
 'Maebry',
 'Macray',
 'Mabrey',
 'Mary']

In [26]:
f_total = []
 
f_names = names[names['name'].isin(group1)][names.sex == 'F']

for year in years:
    #все имена для мальчиков и девочек в год year
    f_names_for_year = f_names[f_names.year == year] 
    names_for_year = names[names['name'].isin(group1)][names.year == year]
    f_total.append((f_names_for_year['births'].sum() / names_for_year['births'].sum() )* 100)


C:\Users\User\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  app.launch_new_instance()
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\__main__.py:8: UserWarning: Boolean Series key will be reindexed to match DataFrame index.

In [27]:
plt.plot(years, f_total, color = 'red')
plt.title('Доли популярных имен в разрезе полов')
plt.xlabel('Года')
plt.ylabel('Доля в %')
plt.legend(['Ж'])
plt.show()



In [ ]:
f_total

In [ ]: