In [10]:
import pandas as pd
import os.path as path
"""
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
"""
Out[10]:
In [14]:
# Directory that holds the yobYYYY.txt name files (note the trailing slash).
PATH = '/Users/martysyuk/Documents/Python 3 Coding/Repositorys/PY-3-Learning/homeworks/names/'

# The file has no header row, so the column names are supplied explicitly.
names = pd.read_csv(
    path.join(PATH, 'yob1981.txt'),
    names=['Name', 'Gender', 'Count'],
)
names.head(10)
Out[14]:
In [4]:
# Rows with Gender "F" and a Count above 5000; show the first ten.
(
    names
    .query('Gender=="F" & Count > 5000')
    .head(n=10)
)
Out[4]:
In [5]:
# First ten rows whose Gender is 'M', selected with an explicit boolean mask.
names.loc[names['Gender'] == 'M'].head(n=10)
Out[5]:
In [6]:
# Write the first ten 'M' rows to done.txt next to the input files,
# without the DataFrame index column.
(
    names.loc[names['Gender'] == 'M']
    .head(n=10)
    .to_csv(PATH + 'done.txt', index=False)
)
In [7]:
# Ten rows with the largest Count, in descending order.
(
    names
    .sort_values('Count', ascending=False)
    .head(n=10)
)
Out[7]:
In [8]:
# Sum of Count for each gender; the message reports the 'M' total first, then 'F'.
print('Родилось {} мальчиков и {} девочек'.format(
    names.loc[names['Gender'] == 'M', 'Count'].sum(),
    names.loc[names['Gender'] == 'F', 'Count'].sum(),
))
In [9]:
def count_to_len(row):
    """Return a copy of ``row`` with ``Count`` replaced by the length of ``Name``.

    Written to be passed to ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A row that has at least the labels ``Name`` and ``Count``.

    Returns
    -------
    pandas.Series
        A new Series; ``row`` itself is left untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by ``apply``.
    result = row.copy()
    result['Count'] = len(row['Name'])
    return result
# Run count_to_len on every row (axis=1) and preview the first ten results.
(
    names
    .apply(count_to_len, axis=1)
    .head(n=10)
)
Out[9]:
In [10]:
# Column layout used when reading each yobYYYY.txt file.
cols = ['Name', 'Gender', 'Count']
# Key columns for the joins.
# NOTE(review): `marge_on` looks like a misspelling of `merge_on`; the name is
# kept because later cells reference it.
marge_on = ['Name', 'Gender']

names_1981 = pd.read_csv(PATH + 'yob1981.txt', names=cols)
names_1985 = pd.read_csv(PATH + 'yob1985.txt', names=cols)

# Join the two years on (Name, Gender); the clashing Count columns get a
# per-year suffix.
names_1981.merge(
    names_1985,
    on=marge_on,
    suffixes=('_1981', '_1985'),
).head(10)
Out[10]:
Слияние более чем двух файлов происходит поэтапно. Сначала переменной присваивается результат merge первых двух лет, затем в следующую переменную записывается merge результата первого слияния и ещё одного файла.
In [11]:
# Load three consecutive years; each file is read with the column layout in `cols`.
names_1981 = pd.read_csv(PATH + 'yob1981.txt', names=cols)
names_1982 = pd.read_csv(PATH + 'yob1982.txt', names=cols)
names_1983 = pd.read_csv(PATH + 'yob1983.txt', names=cols)

# Step 1: join 1981 with 1982. The right-hand frame used to be names_1985,
# which contradicted the '_1982' suffix and left names_1982 unused.
names_81_82 = pd.merge(
    names_1981, names_1982, on=marge_on, suffixes=('_1981', '_1982')
)

# Step 2: join the 1983 data. Its Count column keeps the plain name `Count`
# because the step-1 result only has Count_1981 / Count_1982.
# The full frames are kept here: truncating the intermediate results with
# .head(10) would make every later aggregate depend on just the first rows.
names_81_82_83 = pd.merge(names_81_82, names_1983, on=marge_on)

# Preview only; the variable itself holds the complete merged frame.
names_81_82_83.head(10)
Out[11]:
In [12]:
# Stack the three yearly frames. `keys` labels each input frame with its year
# and builds a two-level index; `names` then names those levels. Without
# `keys`, pd.concat ignores `names` and the year of each row is lost.
names_all = pd.concat(
    [names_1981, names_1982, names_1983],
    keys=[1981, 1982, 1983],
    names=['Year', 'Pos'],
)
names_all
Out[12]:
In [13]:
def agg_count(row):
    """Return a copy of ``row`` whose ``Count`` is the total over all three columns.

    The new ``Count`` is ``Count_1981 + Count_1982 + Count``. Written to be
    passed to ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A row that has the labels ``Count_1981``, ``Count_1982`` and ``Count``.

    Returns
    -------
    pandas.Series
        A new Series; ``row`` itself is left untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by ``apply``.
    result = row.copy()
    result['Count'] = row['Count_1981'] + row['Count_1982'] + row['Count']
    return result
# Total the yearly counts row by row, then list the ten largest totals.
(
    names_81_82_83
    .apply(agg_count, axis=1)
    .sort_values('Count', ascending=False)
    .head(n=10)
)
Out[13]: