学习离散概率


In [ ]:
# https://plot.ly/python/discrete-frequency/

import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

import numpy as np
import pandas as pd

import warnings
# Suppress the two numpy "size changed" warnings, matched by message text.
for ignored_message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message=ignored_message)

# Initialise Plotly's offline notebook mode so the iplot() calls below can render.
py.offline.init_notebook_mode(connected=True)

# This notebook mainly exercises the histnorm parameter: 'density|probability|percent|'

In [ ]:
# Show the distribution progressively with a Histogram by controlling the bin size.

数据导入


In [ ]:
# Read the CSV that sits next to the notebook (relative path).
df_data = pd.read_csv(filepath_or_buffer='./2010_alcohol_consumption_by_country.csv')
nd_data = df_data.values

# Only the last expression of a cell is displayed, so a bare `type(...)` in the
# middle of the cell is silently discarded; print it to make the check visible.
print(type(df_data), type(nd_data))

# Preview the first ten rows as a Plotly table.
fig = ff.create_table(table_text=df_data.head(10))
py.offline.iplot(fig)

# Column that every later cell plots.
alcohol_ss_data = df_data['alcohol']

type(alcohol_ss_data)

密度展示


In [ ]:
# Histogram of the alcohol column: 0.5-wide bins from the smallest to the largest
# observed value, normalised with histnorm='density'.
trace_hist = go.Histogram(
    x=alcohol_ss_data,
    xbins={
        'start': np.min(alcohol_ss_data),
        'size': .5,
        'end': np.max(alcohol_ss_data),
    },
    histnorm='density',
    name='trace-name-t1',
    marker={'color': '#0000FF'},
)

# Layout object that only carries the figure title.
layout = go.Layout(title='layout-title-density')

fig = go.Figure(data=[trace_hist], layout=layout)
py.offline.iplot(fig)

概率分布


In [ ]:
# Reuse the existing histogram trace, switching its normalisation to 'probability',
# and redraw it under its own title.
trace_hist['histnorm'] = 'probability'
fig = go.Figure(layout=layout, data=[trace_hist])
fig.layout['title'] = 'layout-title-probability'
py.offline.iplot(fig)

百分比展示


In [ ]:
# Reuse the existing histogram trace, switching its normalisation to 'percent',
# and redraw it under its own title.
trace_hist['histnorm'] = 'percent'

fig = go.Figure(layout=layout, data=[trace_hist])
fig.layout['title'] = 'layout-title-percent'
py.offline.iplot(fig)

累积分布


In [ ]:
# Running total of the alcohol column in its original row order.
alcohol_ss_cum = alcohol_ss_data.cumsum()
print(alcohol_ss_data.min(), alcohol_ss_data.max(), alcohol_ss_data.size, alcohol_ss_cum.size)
print(alcohol_ss_data.head(5))
print("\n")
print(alcohol_ss_cum.head(5))

# NOTE (from the original author): this example looks questionable.
# x is just the row position and y is the running total divided by the final
# total, so the curve depends on the order of the rows in the file.
row_positions = list(range(len(alcohol_ss_cum)))
running_share = alcohol_ss_cum / alcohol_ss_cum.iloc[-1]
trace = go.Scatter(
    x=row_positions,
    y=running_share,
    marker={'color': '#0000FF'},
)

fig = go.Figure(layout=layout, data=[trace])
fig.layout['title'] = "CDF"
py.offline.iplot(fig)

In [ ]:
# Empirical CDF: for each observed value x, the fraction of observations <= x.
# Missing values (if the column has any) are dropped so they are not counted
# as observations.
alcohol_ss_sort_data = alcohol_ss_data.dropna().sort_values()
print(alcohol_ss_sort_data[-10:])

# The k-th smallest of n values has cumulative probability k / n. The previous
# version divided the running *sum of the values* by their total, which gives
# the cumulative share of the total rather than a probability.
n_obs = alcohol_ss_sort_data.size
ecdf_y = np.arange(1, n_obs + 1) / n_obs

trace2 = go.Scatter(
    x = alcohol_ss_sort_data.tolist(),
    y = ecdf_y,
    marker = dict(
        color = '#0000FF',
    )
)

fig = go.Figure(data = [trace2], layout=layout)
fig.layout.title = "CDF2"
py.offline.iplot(fig)