本培训教程通过 Jupyter Notebook 讲解如何使用 Python 的 Pandas 库进行数据分析和图表呈现,包括下面这些内容:
我们以分析新冠疫情数据为例,来看看怎么从接口获取数据、整理数据和制作图表等。
其中会涉及到各类知识点,我们会尽量详细解释清楚。
我们先举一个完整的例子:从调用 API 接口开始,用 Pandas 来处理基本数据,整个过程在 Python 的 Jupyter Notebook 环境中实现。
In [3]:
# Demo for the infection/region endpoint (infection data per country/region):
# given a region and a start_date, fetch the data and load it into pandas.
import requests
import pandas as pd
# Base URL of the API
url = 'https://covid-19.adapay.tech/api/v1/'
# API token; one can be obtained by calling the register endpoint.
# NOTE(review): the token is hard-coded in the notebook - consider loading it
# from configuration instead of committing it.
token = '497115d0c2ff9586bf0fe03088cfdbe2'
# Region or country to query
region = 'Italy'
# Request headers; the API token is sent in the 'token' header
headers = {
    'token': token
}
# Query parameters
payload = {
    'region': region,
    'start_date': '2020-03-24'
}
# Call the API with requests. The timeout keeps the cell from hanging forever
# on an unresponsive server, and raise_for_status() surfaces HTTP errors here
# instead of failing later with an opaque JSON or KeyError.
r = requests.get(url + 'infection/region', params=payload, headers=headers, timeout=10)
r.raise_for_status()
data = r.json()
# Build a DataFrame from the entry for the requested region
df = pd.DataFrame.from_dict(data['data']['region'][region])
print(df)
print('---')
In [2]:
# Demo for the infection/region endpoint:
# given a region, a start_date and an end_date, fetch the data.
# Query parameters (reuses `region`, `url` and `headers` from the earlier cell)
payload = {
    'region': region,
    'start_date': '2020-03-24',
    'end_date': '2020-03-31'
}
# Call the API with requests. The timeout prevents an indefinite hang and
# raise_for_status() reports HTTP errors before the body is parsed.
r = requests.get(url + 'infection/region', params=payload, headers=headers, timeout=10)
r.raise_for_status()
data = r.json()
# Build a DataFrame from the entry for the requested region
df = pd.DataFrame.from_dict(data['data']['region'][region])
print(df)
print('---')
In [3]:
# Continuing the infection/region demo (region, start_date, end_date):
# swap the rows and columns of the DataFrame so that the row index is the date.
df = df.transpose()
print(df)
print('---')
In [4]:
# Continuing the infection/region demo (region, start_date, end_date),
# with the DataFrame already transposed so that the row index is the date.
# Add a calculated column: mortality rate = deaths / confirmed.
# The division is done on whole columns (vectorized) rather than row by row
# through apply(); casting to float keeps the result numeric and makes a zero
# denominator yield inf/NaN.
df['mortality rate'] = df['deaths'].astype(float) / df['confirmed'].astype(float)
print(df)
print('---')
In [5]:
# Draw a line chart of the 'confirmed' and 'deaths' columns.
import requests
import pandas as pd
import matplotlib.pyplot as plt
# Base URL of the API
url = 'https://covid-19.adapay.tech/api/v1/'
# API token; one can be obtained by calling the register endpoint.
# NOTE(review): the token is hard-coded in the notebook - consider loading it
# from configuration instead of committing it.
token = '497115d0c2ff9586bf0fe03088cfdbe2'
# Region or country to query
region = 'Italy'
# Request headers; the API token is sent in the 'token' header
headers = {
    'token': token
}
# Query parameters
payload = {
    'region': region,
    'start_date': '2020-03-24',
    'end_date': '2020-03-31'
}
# Call the API with requests. The timeout prevents an indefinite hang and
# raise_for_status() reports HTTP errors before the body is parsed.
r = requests.get(url + 'infection/region', params=payload, headers=headers, timeout=10)
r.raise_for_status()
data = r.json()
# Build a DataFrame from the entry for the requested region, then transpose it
df = pd.DataFrame.from_dict(data['data']['region'][region])
df = df.T
# DataFrame.plot() opens its own figure unless an Axes is passed, which would
# leave the figure created here empty; draw onto its current Axes instead.
plt.figure()
df[['confirmed', 'deaths']].plot(kind='line', ax=plt.gca())
Out[5]:
In [6]:
# Draw a bar chart of the 'confirmed_add', 'deaths_add' and 'recovered_add'
# columns (presumably the daily increments - confirm against the API docs).
# DataFrame.plot() opens its own figure unless an Axes is passed, which would
# leave the figure created here empty; draw onto its current Axes instead.
plt.figure()
df[['confirmed_add', 'deaths_add', 'recovered_add']].plot(kind='bar', ax=plt.gca())
Out[6]:
In [7]:
# Demo for the infection/region/detail endpoint.
import requests
import pandas as pd
import matplotlib.pyplot as plt
# Base URL of the API
url = 'https://covid-19.adapay.tech/api/v1/'
# API token; one can be obtained by calling the register endpoint.
# NOTE(review): the token is hard-coded in the notebook - consider loading it
# from configuration instead of committing it.
token = '497115d0c2ff9586bf0fe03088cfdbe2'
# Region or country to query
region = 'US'
# Request headers; the API token is sent in the 'token' header
headers = {
    'token': token
}
# Query parameters
payload = {
    'region': region,
    'start_date': '2020-03-24',
    'end_date': '2020-03-31'
}
# The timeout prevents an indefinite hang and raise_for_status() reports HTTP
# errors before the body is parsed.
r = requests.get(url + 'infection/region/detail', params=payload, headers=headers, timeout=10)
r.raise_for_status()
data = r.json()
# Load the whole 'data' section of the response and preview the first rows
df = pd.DataFrame.from_dict(data['data'])
df.head()
Out[7]:
In [8]:
# Narrow down to the 'area' section of the response and preview the first rows.
df = pd.DataFrame.from_dict(data['data']['area'], orient='columns')
df.head(5)
Out[8]:
In [9]:
# Build a DataFrame from the 'New York' entry of the 'area' section and show it.
df = pd.DataFrame.from_dict(data['data']['area']['New York'], orient='columns')
df
Out[9]:
In [10]:
# Transpose the DataFrame, then draw a bar chart of the 'confirmed_add',
# 'deaths_add' and 'recovered_add' columns.
df = df.T
# DataFrame.plot() opens its own figure unless an Axes is passed, which would
# leave the figure created here empty; draw onto its current Axes instead.
plt.figure()
df[['confirmed_add', 'deaths_add', 'recovered_add']].plot(kind='bar', ax=plt.gca())
Out[10]:
In [11]:
# Build a DataFrame from the 'California' entry of the 'area' section and show it.
df = pd.DataFrame.from_dict(data['data']['area']['California'], orient='columns')
df
Out[11]:
In [20]:
# Demo for the infection/global endpoint.
import requests
import pandas as pd
import matplotlib.pyplot as plt
# Base URL of the API
url = 'https://covid-19.adapay.tech/api/v1/'
# API token; one can be obtained by calling the register endpoint.
# NOTE(review): the token is hard-coded in the notebook - consider loading it
# from configuration instead of committing it.
token = '497115d0c2ff9586bf0fe03088cfdbe2'
# Request headers; the API token is sent in the 'token' header
headers = {
    'token': token
}
# The timeout prevents an indefinite hang and raise_for_status() reports HTTP
# errors before the body is parsed.
r = requests.get(url + 'infection/global', headers=headers, timeout=10)
r.raise_for_status()
data = r.json()
# Build a DataFrame from the global 'region' section, transpose it and show it
df = pd.DataFrame.from_dict(data['data']['global']['region'])
df = df.T
df
Out[20]:
In [24]:
# Inspect the DataFrame: its dimensions, its column labels and a summary.
print(df.shape)
print(df.columns)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
df.info()
In [ ]:
# Load the complete parsed response into a DataFrame (top-level keys become columns).
df = pd.DataFrame.from_dict(data, orient='columns')
In [ ]:
"""
@author:Bingo.he
@file: get_target_value.py
@time: 2017/12/22
"""
def get_target_value(key, dic, tmp_list):
    """Collect every value stored under ``key`` anywhere inside nested data.

    The dictionary is walked depth-first: a match on the current level is
    recorded first, then each value is visited in iteration order (dicts
    directly, lists and tuples through ``_get_value``).

    :param key: the key to look for
    :param dic: the JSON-like dictionary to search
    :param tmp_list: list that receives the matching values; mutated in place
    :return: ``tmp_list``, or an error-message string when ``dic`` is not a
        dict or ``tmp_list`` is not a list
    """
    # Validate the argument types up front.
    if not isinstance(dic, dict) or not isinstance(tmp_list, list):
        return 'argv[1] not an dict or argv[-1] not an list '

    # Record the value when the key exists on this level.
    if key in dic:
        tmp_list.append(dic[key])

    # Whether or not the key matched, keep searching inside the values.
    for value in dic.values():
        if isinstance(value, dict):
            # Nested dictionary: recurse directly.
            get_target_value(key, value, tmp_list)
        elif isinstance(value, (list, tuple)):
            # List or tuple: let the helper walk its elements.
            _get_value(key, value, tmp_list)

    return tmp_list


def _get_value(key, val, tmp_list):
    """Search each element of the list/tuple ``val`` for ``key``.

    Dictionary elements are handed to ``get_target_value``; nested lists and
    tuples are walked recursively. Matches are appended to ``tmp_list``;
    nothing is returned.
    """
    for val_ in val:
        if isinstance(val_, dict):
            get_target_value(key, val_, tmp_list)
        elif isinstance(val_, (list, tuple)):
            _get_value(key, val_, tmp_list)
# Gather every value stored under the key 'New York' anywhere in the response
# and print the resulting list.
list0 = []
print(get_target_value(key='New York', dic=data, tmp_list=list0))
In [ ]:
# Call the authentication/register endpoint with an e-mail address and print
# the response object.
url = 'https://covid-19.adapay.tech/api/v1/'
# The timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.post(url + 'authentication/register', data={'email': 'wingfish@gmail.com'}, timeout=10)
print(r)
In [ ]: