Official documentation: http://pandas.pydata.org/pandas-docs/stable/
In [ ]:
%matplotlib inline
#%matplotlib notebook
from IPython.display import display
import matplotlib
matplotlib.rcParams['figure.figsize'] = (9, 9)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import pandas as pd
import numpy as np
In [ ]:
pd.__version__
In [ ]:
data_list = [1, 3, np.nan, 7]
series = pd.Series(data_list)
series
In [ ]:
data_array = np.array(data_list)
series = pd.Series(data_array)
series
In [ ]:
indices = pd.Series([1, 3, 5, 7])
series = pd.Series([10, 30, 50, 70], index=indices)
series
In [ ]:
indices = pd.Series(['A', 'B', 'C', 'D'])
series = pd.Series([10, 30, 50, 70], index=indices)
series
In [ ]:
data_dict = {'A': 10, 'B': 30, 'C': 50, 'D': 70}
series = pd.Series(data_dict)
series
In [ ]:
series.index
In [ ]:
series.values
In [ ]:
series.shape
In [ ]:
series.dtypes
In [ ]:
series.describe()
In [ ]:
type(series.describe())
In [ ]:
series.memory_usage()
In [ ]:
dates = pd.date_range('20130101', periods=6)
dates
In [ ]:
dates = pd.date_range(start='2013-01-01', end='2013-01-08')
dates
In [ ]:
dates = pd.date_range('2013-01-01', periods=4, freq='M')
dates
In [ ]:
num_days = 7
data = np.random.random(num_days)
index = pd.date_range('2017-01-01', periods=num_days)
series = pd.Series(data, index)
series
In [ ]:
data_list = [[1, 2, 3], [4, 5, 6]]
df = pd.DataFrame(data_array)
df
In [ ]:
data_array = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(data_array)
df
Using lists:
In [ ]:
data = [[1, 2, 3], [4, 5, 6]]
index = [10, 20]
columns = ['A', 'B', 'C']
df = pd.DataFrame(data, index, columns)
df
Using numpy arrays
:
In [ ]:
data = np.array([[1, 2, 3], [4, 5, 6]])
index = np.array([10, 20])
columns = np.array(['A', 'B', 'C'])
df = pd.DataFrame(data, index=index, columns=columns)
df
Using Series
:
In [ ]:
data = np.array([[1, 2, 3], [4, 5, 6]])
index = pd.Series([10, 20])
columns = pd.Series(['A', 'B', 'C'])
df = pd.DataFrame(data, index=index, columns=columns)
df
Dictionary keys define columns label.
In [ ]:
data_dict = {'A': 'foo',
'B': [10, 20, 30],
'C': 3}
df = pd.DataFrame(data_dict)
df
To define index as well:
In [ ]:
data_dict = {'A': 'foo',
'B': [10, 20, 30],
'C': 3}
df = pd.DataFrame(data_dict, index=[10, 20, 30])
df
In [ ]:
df.index
In [ ]:
df.columns
In [ ]:
df.values
In [ ]:
df.shape
In [ ]:
df.dtypes
In [ ]:
df.info()
In [ ]:
df.describe()
In [ ]:
type(df.describe())
In [ ]:
df.memory_usage()
DataFrame's columns can have different types. But what about rows ?
What append when a DataFrame with columns having different type is transposed ?
In [ ]:
data_dict = {'A': 'foo',
'B': [10, 20, 30],
'C': 3}
df = pd.DataFrame(data_dict)
df
In [ ]:
df.dtypes
In [ ]:
df2 = df.T
df2
In [ ]:
df2.dtypes
Panels are deprecated.
Pandas now focuses on 1D (Series
) and 2D (DataFrame
) data structures.
The recommended alternative to work with 3-dimensional data is the xarray python library.
An other workaround: one can simply use a MultiIndex DataFrame
for easily working with higher dimensional data.
See http://pandas.pydata.org/pandas-docs/stable/dsintro.html#deprecate-panel.
Panel4D
and PanelND
are deprecated.
Pandas now focuses on 1D (Series
) and 2D (DataFrame
) data structures.
The recommended alternative to work with n-dimensional data is the xarray python library.
An other workaround: one can simply use a MultiIndex DataFrame
for easily working with higher dimensional data.
See http://pandas.pydata.org/pandas-docs/stable/dsintro.html#panel4d-and-panelnd-deprecated.
Reader functions are accessibles from the top level pd
object.
Writer functions are accessibles from data objects (i.e. Series
, DataFrame
or Panel
objects).
In [ ]:
data_array = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(data_array, index=[10, 20], columns=[100, 200, 300])
df
See python_pandas_hdf5_en.ipynb
...
Simplest version:
In [ ]:
df.to_csv(path_or_buf="python_pandas_io_test.csv")
In [ ]:
!cat python_pandas_io_test.csv
Setting more options:
In [ ]:
# FYI, many other options are available
df.to_csv(path_or_buf="python_pandas_io_test.csv",
sep=',',
columns=None,
header=True,
index=True,
index_label=None,
compression=None, # allowed values are 'gzip', 'bz2' or 'xz'
date_format=None)
In [ ]:
!cat python_pandas_io_test.csv
Simplest version:
In [ ]:
df = pd.read_csv("python_pandas_io_test.csv")
df
Setting more options:
In [ ]:
df = pd.read_csv("python_pandas_io_test.csv",
sep=',',
delimiter=None,
header='infer',
names=None,
index_col=0,
usecols=None,
squeeze=False,
prefix=None,
mangle_dupe_cols=True,
dtype=None,
engine=None,
converters=None,
true_values=None,
false_values=None,
skipinitialspace=False,
skiprows=None,
nrows=None,
na_values=None,
keep_default_na=True,
na_filter=True,
verbose=False,
skip_blank_lines=True,
parse_dates=False,
infer_datetime_format=False,
keep_date_col=False,
date_parser=None,
dayfirst=False,
iterator=False,
chunksize=None,
compression='infer',
thousands=None,
decimal=b'.',
lineterminator=None,
quotechar='"',
quoting=0,
escapechar=None,
comment=None,
encoding=None,
dialect=None,
#tupleize_cols=False,
error_bad_lines=True,
warn_bad_lines=True,
skipfooter=0,
#skip_footer=0,
doublequote=True,
delim_whitespace=False,
#as_recarray=False,
#compact_ints=False,
#use_unsigned=False,
low_memory=True,
#buffer_lines=None,
memory_map=False,
float_precision=None)
df
In [ ]:
!rm python_pandas_io_test.csv
In [ ]:
import io
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test.json")
In [ ]:
!cat python_pandas_io_test.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_split.json",
orient="split")
In [ ]:
!cat python_pandas_io_test_split.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_records.json",
orient="records")
In [ ]:
!cat python_pandas_io_test_records.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_index.json",
orient="index")
In [ ]:
!cat python_pandas_io_test_index.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_columns.json",
orient="columns")
In [ ]:
!cat python_pandas_io_test_columns.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_values.json",
orient="values")
In [ ]:
!cat python_pandas_io_test_values.json
In [ ]:
# FYI, many other options are available
df.to_json(path_or_buf="python_pandas_io_test.json",
orient='columns', # For DataFrame: 'split','records','index','columns' or 'values'
date_format=None, # None, 'epoch' or 'iso'
double_precision=10,
force_ascii=True,
date_unit='ms')
In [ ]:
!cat python_pandas_io_test.json
Dict like data {index -> [index], columns -> [columns], data -> [values]}
In [ ]:
!cat python_pandas_io_test_split.json
In [ ]:
df = pd.read_json("python_pandas_io_test_split.json",
orient="split")
df
List like [{column -> value}, ... , {column -> value}]
In [ ]:
!cat python_pandas_io_test_records.json
In [ ]:
df = pd.read_json("python_pandas_io_test_records.json",
orient="records")
df
Dict like {index -> {column -> value}}
In [ ]:
!cat python_pandas_io_test_index.json
In [ ]:
df = pd.read_json("python_pandas_io_test_index.json",
orient="index")
df
Dict like {column -> {index -> value}}
In [ ]:
!cat python_pandas_io_test_columns.json
In [ ]:
df = pd.read_json("python_pandas_io_test_columns.json",
orient="columns")
df
Just the values array
In [ ]:
!cat python_pandas_io_test_values.json
In [ ]:
df = pd.read_json("python_pandas_io_test_values.json",
orient="values")
df
In [ ]:
df = pd.read_json("python_pandas_io_test.json",
orient=None,
typ='frame',
dtype=True,
convert_axes=True,
convert_dates=True,
keep_default_dates=True,
numpy=False,
precise_float=False,
date_unit=None,
encoding=None,
lines=False)
df
In [ ]:
!rm python_pandas_io_test*.json
In [ ]:
!echo "- {A: 1, B: 2}" > python_pandas_io_test.yaml
!echo "- {A: 3}" >> python_pandas_io_test.yaml
!echo "- {B: 4}" >> python_pandas_io_test.yaml
In [ ]:
!cat python_pandas_io_test.yaml
In [ ]:
try:
import yaml
with open('python_pandas_io_test.yaml', 'r') as f:
df = pd.io.json.json_normalize(yaml.load(f))
print(df)
except:
pass
In [ ]:
!rm python_pandas_io_test.yaml
Many other file formats can be used to import or export data with JSON.
See the following link for more information: http://pandas.pydata.org/pandas-docs/stable/io.html
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
The following instructions return a Series
.
In [ ]:
df.B
In [ ]:
df["B"]
In [ ]:
df.loc[:,"B"]
In [ ]:
df.iloc[:,1]
In [ ]:
df[['A','B']]
In [ ]:
df.loc[:,['A','B']]
In [ ]:
df.iloc[:,0:2]
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=["i" + str(i+1) for i in range(9)],
columns=['A', 'B', 'C'])
df
The following instructions return a Series
.
In [ ]:
df.loc["i3"]
In [ ]:
df.loc["i3",:]
In [ ]:
df.iloc[2] # Select over index
In [ ]:
df.iloc[2,:] # Select over index
In [ ]:
df.loc[["i3", "i4"],:]
In [ ]:
df.iloc[2:4,:] # Select over index
In [ ]:
df.B < 50.
In [ ]:
type(df.B < 50.)
In [ ]:
df[[True, True, True, True, False, False, False, False, False]]
In [ ]:
series_mask = pd.Series({'i1': True,
'i2': True,
'i3': True,
'i4': True,
'i5': False,
'i6': False,
'i7': False,
'i8': False,
'i9': False})
df[series_mask]
In [ ]:
df[df.B < 50.]
In [ ]:
df[df['B'] < 50.]
In [ ]:
df[(df.A >= 2) & (df.B < 50)]
This can be written:
In [ ]:
df.loc[(df.A >= 2) & (df.B < 50)]
This could be written df[df.A >= 2][df.B < 50]
but this is a bad practice (named "chained indexing").
"When setting values in a pandas object, care must be taken to avoid what is called chained indexing".
See:
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df[(df.A >= 2) & (df.B < 50)]
In [ ]:
df[(df.B < 20) | (df.B > 50)]
In [ ]:
df.loc[(df.B < 20) | (df.B > 50), 'C']
In [ ]:
df[(df['A'] >= 2) & (df['B'] < 50)]
In [ ]:
df.loc[(df.A >= 2) & (df.B < 50), ['A','B']]
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df.B *= 2.
df
In [ ]:
df.B = pow(df.B, 2)
df
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df[df.B < 50.] *= -1.
df
In [ ]:
# df['B'][df['B'] < 50.] = 0 # OK but chain indexing is bad...
# df.A[df.B < 50.] = 0 # OK but chain indexing is bad...
df.loc[df.B < 50., 'A'] = 0
df
WARNING: df[df.B < 50.].A = 0
does NOT work even if df.A[df.B < 50.]
and df[df.B < 50.].A
seems to produce the same result...
"When setting values in a pandas object, care must be taken to avoid what is called chained indexing".
See:
In [ ]:
df.loc[(df.B < 50.) & (df.B > 20), 'C'] = 0
df
In [ ]:
df.loc[(df.B < 20) | (df.B > 50), 'C'] = -1
df
In [ ]:
df[df.B < 50.] = pow(df[df.B < 50.], 2)
df
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
Draw 3 samples:
In [ ]:
df.sample(n=30, replace=True)
Sample 90% of the rows:
In [ ]:
df.sample(frac=0.9, replace=True)
Draw 3 samples:
In [ ]:
df.sample(n=3)
Sample 90% of the rows:
In [ ]:
df.sample(frac=0.9)
In [ ]:
df.sample(n=30, replace=True, weights=np.arange(len(df)))
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df = df.sample(frac=1)
df
To reset indexes too:
In [ ]:
df = df.sample(frac=1).reset_index(drop=True)
df
In [ ]:
NROWS = 7
col1 = np.arange(1., NROWS, 1)
col2 = np.arange(10., NROWS*10, 10)
col3 = np.arange(100., NROWS*100, 100)
np.random.shuffle(col1)
np.random.shuffle(col2)
np.random.shuffle(col3)
data = np.array([col1,
col2,
col3]).T
index = np.arange(1, NROWS, 1)
columns = np.array(['A', 'B', 'C'])
np.random.shuffle(index)
np.random.shuffle(data)
np.random.shuffle(columns)
df = pd.DataFrame(data,
index=index,
columns=columns)
df
In [ ]:
df.sort_index()
In [ ]:
df.sort_index(axis=0) # axis=0 -> sort by row index
In [ ]:
df.sort_index(ascending=False)
In [ ]:
df.sort_index(axis=1) # axis=1 -> sort by column label
In [ ]:
df.sort_index(axis=1, ascending=False)
In [ ]:
df.sort_values(by='B')
In [ ]:
df.sort_values(by='B', ascending=False)
In [ ]:
df.sort_values(by='B', inplace=True)
df
In [ ]:
a = np.array([[3, np.nan, 5, np.nan, 7],
[2, 4, np.nan, 3, 1],
[3, 4, 5, 6, 1]]).T
df = pd.DataFrame(a,
columns=['A', 'B', 'C'])
df
In [ ]:
df.isnull()
In [ ]:
df.dropna()
In [ ]:
df.dropna(how='any') # but 'any' is the default value...
In [ ]:
df.dropna(subset=['B'])
In [ ]:
df.dropna(subset=['B', 'C'])
In [ ]:
df.dropna(axis=1)
In [ ]:
df.dropna(axis=1, how='any') # but 'any' is the default value...
In [ ]:
df.dropna(axis=1, subset=[2])
In [ ]:
df.dropna(axis=1, subset=[1, 2])
In [ ]:
df.fillna(value=999)
In [ ]:
df.A.isnull().sum()
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df.T
In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
columns=['ID', 'B', 'C'])
a2 = np.array([np.arange(1, 5, 1), np.arange(1000, 5000, 1000), np.arange(10000, 50000, 10000)]).T
df2 = pd.DataFrame(a2,
columns=['ID', 'B', 'C'])
display(df1)
display(df2)
df = pd.merge(df1, df2, on="ID", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
columns=['ID', 'B', 'C'])
a2 = np.array([np.arange(1, 5, 1), np.arange(1000, 5000, 1000), np.arange(10000, 50000, 10000)]).T
df2 = pd.DataFrame(a2,
columns=['ID', 'B', 'C'])
df1.iloc[0,2] = np.nan
df1.iloc[1,1] = np.nan
df1.iloc[2,2] = np.nan
df1.iloc[3,1] = np.nan
df2.iloc[0,1] = np.nan
df2.iloc[1,2] = np.nan
df2.iloc[2,1] = np.nan
df2.iloc[3,2] = np.nan
df = pd.merge(df1, df2, on="ID", suffixes=('_1', '_2')) #.dropna(how='any')
display(df1)
display(df2)
display(df)
In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
columns=['ID', 'B', 'C'])
a2 = np.array([np.arange(1, 3, 1), np.arange(1000, 3000, 1000), np.arange(10000, 30000, 10000)]).T
df2 = pd.DataFrame(a2,
columns=['ID', 'B', 'C'])
display(df1)
display(df2)
print("Left: use only keys from left frame (SQL: left outer join)")
df = pd.merge(df1, df2, on="ID", how="left", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
print("Right: use only keys from right frame (SQL: right outer join)")
df = pd.merge(df1, df2, on="ID", how="right", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
print("Inner: use intersection of keys from both frames (SQL: inner join) [DEFAULT]")
df = pd.merge(df1, df2, on="ID", how="inner", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
print("Outer: use union of keys from both frames (SQL: full outer join)")
df = pd.merge(df1, df2, on="ID", how="outer", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
In [ ]:
a = np.array([[3, 5, 5, 5, 7, 7, 7, 7],
[2, 4, 4, 3, 1, 3, 3, 2],
[3, 4, 5, 6, 1, 8, 9, 8]]).T
df = pd.DataFrame(a,
columns=['A', 'B', 'C'])
df
In [ ]:
df.groupby(["A"]).count()
In [ ]:
df.groupby(["A"]).sum().B
In [ ]:
df.groupby(["A"]).mean().B
In [ ]:
df.groupby(["A","B"]).count()
Rolling with an aggregation window of size 2.
In [ ]:
s = pd.Series([1., 0., 5., 2., 1.])
print("DATA:")
print(s)
mean_s = s.rolling(2).mean()
print()
print("ROLLING MEAN:")
print(mean_s)
sum_s = s.rolling(2).sum()
print()
print("ROLLING SUM:")
print(sum_s)
min_s = s.rolling(2).min()
print()
print("ROLLING MIN:")
print(min_s)
max_s = s.rolling(2).max()
print()
print("ROLLING MAX:")
print(max_s)
ax = s.plot(figsize=(18, 3), color="blue")
mean_s.plot(color="red", label="mean", ax=ax)
sum_s.plot(color="green", label="sum", style="--", alpha=0.5, ax=ax)
min_s.plot(color="black", label="min", style=":", alpha=0.25, ax=ax)
max_s.plot(color="black", label="max", style=":", alpha=0.25, ax=ax)
ax.legend();
In [ ]:
index = np.arange(0, 20, 0.05)
s = pd.Series(np.sin(index))
s = s + np.random.normal(scale=0.4, size=s.shape)
ax = s.plot(figsize=(18, 3))
In [ ]:
s.shape
Rolling with an aggregation window of size 20.
In [ ]:
s_mean = s.rolling(20).mean()
s_median = s.rolling(20).median()
s_min = s.rolling(20).min()
s_max = s.rolling(20).max()
ax = s_mean.plot(y='duration', figsize=(18, 8), color="red", label="mean", alpha=0.75)
s_median.plot(ax=ax, color="blue", label="median", alpha=0.75)
s_min.plot(ax=ax, color="blue", alpha=0.5, style=":", label="min")
s_max.plot(ax=ax, color="blue", alpha=0.5, style=":", label="max")
plt.fill_between(s_min.index, s_min.values, s_max.values, facecolor='blue', alpha=0.1)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
s_mean.shape
In [ ]:
df = pd.DataFrame([["i1", "A", 1],
["i1", "B", 2],
["i2", "A", 3],
["i2", "B", 4]], columns=["foo", "bar", "baz"])
df
In [ ]:
df.pivot(index="foo", columns="bar", values="baz")
In [ ]:
a = np.array([[3, 5, 5, 5, 7, 7, 7, 7],
[2, 4, 4, 3, 1, 3, 3, 2],
[3, 4, 5, 6, 1, 8, 9, 8]]).T
df = pd.DataFrame(a,
columns=['A', 'B', 'C'])
df
In [ ]:
df.A.value_counts()
In [ ]:
df.A.value_counts().plot.bar()
In [ ]:
df = pd.DataFrame(np.random.normal(size=100000))
In [ ]:
df.quantile(0.50)
In [ ]:
df.quantile([0.25, 0.75])
In [ ]:
df.quantile([0.01, 0.001])
There are 3 main time related types in Pandas (and the equivalent type for Series
and DataFrame
indices):
pandas.Timestamp
(pandas.DatetimeIndex
for indices): pandas equivalent of python's datetime.datetime
pandas.Period
(pandas.PeriodIndex
for indices): represents a period of timepandas.Timedelta
(pandas.TimedeltaIndex
for indices): represents a duration (the difference between two dates or times) i.e. the pandas equivalent of python's datetime.timedelta
A Timestamp is a point in time:
In [ ]:
pd.Timestamp(year=2018, month=1, day=1, hour=12, minute=30)
A Period is a range in time (with a "anchored" start time and a "anchored" end time):
In [ ]:
p = pd.Period(freq='D', year=2018, month=1, day=1, hour=12, minute=30)
print(p)
print("Start time:", p.start_time)
print("End time:", p.end_time)
A Timedelta is a "floating" duration (i.e. not "anchored" in time):
In [ ]:
print(pd.Timedelta(days=5, seconds=30))
ts1 = pd.Timestamp(year=2018, month=1, day=1, hour=12, minute=30)
ts2 = pd.Timestamp(year=2018, month=1, day=2, hour=12, minute=30)
print(ts2 - ts1)
In [ ]:
pd.date_range('2018-01-01', '2018-03-01', freq='D')
In [ ]:
pd.date_range('2018-01-01', periods=10, freq='h')
In [ ]:
pd.date_range('1/1/2012', periods=10, freq='S')
In [ ]:
pd.date_range('3/6/2012 00:00', periods=5, freq='D')
In [ ]:
pd.date_range('1/1/2012', periods=5, freq='M')
In [ ]:
pd.period_range('2018-01-01', '2018-03-01', freq='D')
In [ ]:
pd.date_range('2018-01-01', '2018-03-01', freq='D').to_period()
In [ ]:
dti = pd.date_range('2012-01-01 00:00', periods=40, freq='D')
ts = pd.Series(np.random.randint(0, 200, len(dti)), index=dti)
In [ ]:
ts.plot();
In [ ]:
ts.plot(x_compat=True);
In [ ]:
dti = pd.date_range('2018-01-01 00:00', '2018-01-03 00:00', freq='H')
ts = pd.Series(np.random.randint(0, 100, len(dti)), index=dti)
ax = ts.plot(x_compat=True, figsize=(16, 4)) # x_compat is required as matplotlib doesn't understand pandas datetime format -> x_compat=True makes the conversion...
# set monthly locator
ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))
ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
# set formatter
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
# set font and rotation for date tick labels
plt.gcf().autofmt_xdate()
In [ ]:
dti = pd.date_range('2012-1-1 00:00', periods=40, freq='D')
ts = pd.Series(np.random.randint(0, 200, len(dti)), index=dti)
In [ ]:
ts
In [ ]:
ts["2012-01-09"]
In [ ]:
ts[datetime.datetime(2012, 1, 9)]
In [ ]:
ts[ts.index < "2012-01-09"]
In [ ]:
ts[ts.index > "2012-01-20"]
In [ ]:
ts["2012-01-09":"2012-01-20"]
In [ ]:
ts[datetime.datetime(2012, 1, 9):datetime.datetime(2012, 1, 20)]
In [ ]:
ts[ts.index.day <= 3]
In [ ]:
ts[ts.index.month == 2]
In [ ]:
ts["2012-02"]
In [ ]:
ts[ts.index.dayofweek == 1]
Rolling window size: 1 day
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 06:45', '2018-1-1 12:00',
'2018-1-2 00:00', '2018-1-2 06:00', '2018-1-2 12:00'])
ts = pd.Series([2., 1., 3., 2., 2., 0.], index=dti)
print("DATA:")
print(ts)
ax = ts.plot(figsize=(18, 3), style="*-", color="blue")
ax.vlines(pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-2 00:00']), ymin=0, ymax=8, color="red", linestyle=":", alpha=0.3);
ts_rw = ts.rolling('D').sum() # Rolling window size: 1 day
print()
print("MEAN:")
print(ts_rw)
ts_rw.plot(color="red", label="sum", style="*-", alpha=0.75, ax=ax)
ax.legend()
ax.set_xlabel('Time')
ax.grid(True);
In [ ]:
ts.rolling('6h').min()
In [ ]:
ts.rolling('3h').mean()
In [ ]:
dti = pd.date_range('1/1/2018 00:00', periods=6*480, freq='10min')
ts = pd.Series(np.sin(dti.hour * 2. * np.pi / 24.), index=dti)
ts = ts + np.random.normal(scale=0.4, size=ts.shape)
ax = ts.plot(figsize=(18, 3))
ax.vlines(pd.date_range('1/1/2018 00:00', periods=480/24, freq='D'), ymin=-2, ymax=2, color="red", linestyle=":", alpha=0.3);
In [ ]:
ts.shape
In [ ]:
ts_mean = ts.rolling('5H').mean()
ts_median = ts.rolling('5H').median()
ts_min = ts.rolling('5H').min()
ts_max = ts.rolling('5H').max()
ax = ts_mean.plot(y='duration', figsize=(18, 3), color="red", label="mean", alpha=0.75)
ts_median.plot(ax=ax, color="blue", label="median", alpha=0.75)
ts_min.plot(ax=ax, color="blue", alpha=0.5, style=":", label="min")
ts_max.plot(ax=ax, color="blue", alpha=0.5, style=":", label="max")
plt.fill_between(ts_min.index, ts_min.values, ts_max.values, facecolor='blue', alpha=0.1)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
ts_mean.shape
resample()
is a time-based groupby, followed by a reduction method on each of its groups.
resample()
is similar to using a rolling()
operation with a time-based offset.
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 06:45', '2018-1-1 12:00',
'2018-1-2 00:00', '2018-1-2 12:00'])
ts = pd.Series([1., 0., 5., 2., 0.], index=dti)
print("DATA:")
print(ts)
ax = ts.plot(figsize=(18, 3), style="*-", color="blue")
ax.vlines(pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-2 00:00']), ymin=0, ymax=5, color="red", linestyle=":", alpha=0.3);
ts_resampled = ts.resample('D').mean()
print()
print("MEAN:")
print(ts_resampled)
ts_resampled.plot(color="red", style="*-", label="mean", alpha=0.75, ax=ax)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
ts.resample('6h').min()
In [ ]:
ts.resample('3h').sum()
No. resample('D')
aggregates values for each day between "00:00:00" and "23:59:59", whatever the first index of ts
.
See the examples bellow...
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 12:00',
'2018-1-2 08:00', '2018-1-2 18:00', '2018-1-2 23:59:59',
'2018-1-3 00:00'])
ts = pd.Series([0.,
10., 20., 30.,
5.], index=dti)
print("DATA:")
print(ts)
ts_resampled = ts.resample('D').mean()
print()
print("MEAN:")
print(ts_resampled)
# Illustrative plot
ax = ts.plot(x_compat=True, figsize=(18, 3), style="*-", color="blue")
ax.vlines(pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-2 00:00', '2018-1-3 00:00']), ymin=-10, ymax=40, color="red", linestyle=":", linewidth=2, alpha=0.5);
ax.vlines(pd.DatetimeIndex(['2018-1-1 12:00', '2018-1-2 12:00', '2018-1-3 12:00']), ymin=-10, ymax=40, color="green", linestyle=":", linewidth=2, alpha=0.5);
ax.plot(pd.DatetimeIndex(['2018-1-1 12:15', '2018-1-2 11:45']), [40, 40], marker="|", markersize=20, color="green")
ax.plot(pd.DatetimeIndex(['2018-1-2 12:15', '2018-1-3 11:45']), [40, 40], marker="|", markersize=20, color="green")
ax.plot(pd.DatetimeIndex(['2018-1-1 00:15', '2018-1-1 23:45']), [35, 35], marker="|", markersize=20, color="red")
ax.plot(pd.DatetimeIndex(['2018-1-2 00:15', '2018-1-2 23:45']), [35, 35], marker="|", markersize=20, color="red")
ts_resampled.plot(color="red", style="*-", label="mean", alpha=0.75, ax=ax)
# set monthly locator
ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))
ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
# set formatter
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y %H:%M'))
# set font and rotation for date tick labels
plt.gcf().autofmt_xdate()
ax.legend()
ax.set_xlabel('Time');
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 01:00',
'2018-1-1 05:30', '2018-1-1 07:30',
'2018-1-1 10:00'])
ts = pd.Series([0.,
10., 20.,
5.], index=dti)
print("DATA:")
print(ts)
ts_resampled = ts.resample('5h').mean()
print()
print("MEAN:")
print(ts_resampled)
# Illustrative plot
ax = ts.plot(x_compat=True, figsize=(18, 3), style="*-", color="blue")
ax.vlines(pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 05:00', '2018-1-1 10:00']), ymin=-10, ymax=40, color="red", linestyle=":", linewidth=2, alpha=0.5);
ax.vlines(pd.DatetimeIndex(['2018-1-1 01:00', '2018-1-1 06:00', '2018-1-1 11:00']), ymin=-10, ymax=40, color="green", linestyle=":", linewidth=2, alpha=0.5);
ax.plot(pd.DatetimeIndex(['2018-1-1 01:05', '2018-1-1 05:55']), [40, 40], marker="|", markersize=20, color="green")
ax.plot(pd.DatetimeIndex(['2018-1-1 06:05', '2018-1-1 10:55']), [40, 40], marker="|", markersize=20, color="green")
ax.plot(pd.DatetimeIndex(['2018-1-1 00:05', '2018-1-1 04:55']), [35, 35], marker="|", markersize=20, color="red")
ax.plot(pd.DatetimeIndex(['2018-1-1 05:05', '2018-1-1 09:55']), [35, 35], marker="|", markersize=20, color="red")
ts_resampled.plot(color="red", style="*-", label="mean", alpha=0.75, ax=ax)
# set monthly locator
ax.xaxis.set_major_locator(mdates.HourLocator(interval=1))
#ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
# set formatter
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
# set font and rotation for date tick labels
plt.gcf().autofmt_xdate()
ax.legend()
ax.set_xlabel('Time');
In [ ]:
dti = pd.date_range('1/1/2018 00:00', periods=60*480, freq='min')
ts = pd.Series(np.sin(dti.hour * 2. * np.pi / 24.), index=dti)
ts = ts + np.random.normal(scale=0.4, size=ts.shape)
ax = ts.plot(figsize=(18, 3))
ax.vlines(pd.date_range('1/1/2018 00:00', periods=480/24, freq='D'), ymin=-2, ymax=2, color="red", linestyle=":", alpha=0.3);
In [ ]:
ts.shape
In [ ]:
ts_mean = ts.resample('2H').mean()
ts_median = ts.resample('2H').median()
ts_min = ts.resample('2H').min()
ts_max = ts.resample('2H').max()
ax = ts_mean.plot(y='duration', figsize=(18, 8), color="red", label="mean", alpha=0.75)
ts_median.plot(ax=ax, color="blue", label="median", alpha=0.75)
ts_min.plot(ax=ax, color="blue", alpha=0.5, style=":", label="min")
ts_max.plot(ax=ax, color="blue", alpha=0.5, style=":", label="max")
plt.fill_between(ts_min.index, ts_min.values, ts_max.values, facecolor='blue', alpha=0.1)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
ts_mean.shape
In [ ]:
rolling_window = '6H'
start = '2018-1-1 00:00'
end = '2018-1-4 00:00'
In [ ]:
dti = pd.date_range(start=start, end=end, freq='min')
ts = pd.Series(np.sin(dti.hour * 2. * np.pi / 24.), index=dti)
ts = ts + np.random.normal(scale=0.4, size=ts.shape)
ax = ts.plot(figsize=(18, 3))
ax.vlines(pd.date_range(start=start, end=end, freq=rolling_window), ymin=-2, ymax=2, color="red", linestyle=":", alpha=0.5);
In [ ]:
ts2 = ts.rolling(rolling_window).mean() # Rolling window size: 1 day
ax = ts2.plot(figsize=(18, 3), color="red", alpha=0.75)
ax.vlines(pd.date_range(start=start, end=end, freq=rolling_window), ymin=-1, ymax=1, color="red", linestyle=":", alpha=0.5);
In [ ]:
ts2 = ts.resample(rolling_window).mean() # Rolling window size: 1 day
ax = ts2.plot(figsize=(18, 3), color="red", alpha=0.75)
ax.vlines(pd.date_range(start=start, end=end, freq=rolling_window), ymin=-1, ymax=1, color="red", linestyle=":", alpha=0.5);
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 12:00', '2018-1-2 00:00', '2018-1-2 12:00'])
ts = pd.Series([1., 0., 2., 1.], index=dti)
print(ts)
ax = ts.plot(figsize=(18, 3))
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-2 00:00'])
ax.vlines(dti, ymin=0, ymax=2, color="red", linestyle=":", alpha=0.3);
In [ ]:
ts_mean = ts.groupby(ts.index.time).mean()
print(ts_mean)
ax = ts_mean.plot(y='duration', figsize=(10, 4), color="red", label="mean", alpha=0.75)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 12:00', '2018-1-2 00:31', '2018-1-2 12:25']) # Here time is not aligned (non constant frequency)
ts = pd.Series([1., 0., 2., 1.], index=dti)
print(ts)
ax = ts.plot(figsize=(18, 3));
In [ ]:
ts_mean = ts.groupby(ts.index.time).mean()
print(ts_mean)
ax = ts_mean.plot(y='duration', figsize=(10, 4), color="red", label="mean", alpha=0.75)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
dti = pd.date_range('1/1/2018 00:00', periods=960, freq='h')
ts = pd.Series(np.sin(dti.hour * 2. * np.pi / 24.), index=dti)
ts = ts + np.random.normal(scale=0.4, size=ts.shape)
ax = ts.plot(figsize=(18, 3))
ax.vlines(pd.date_range('1/1/2018 00:00', periods=960/24, freq='D'), ymin=-2, ymax=2, color="red", linestyle=":", alpha=0.3);
In [ ]:
ts_mean = ts.groupby(ts.index.time).mean()
ts_median = ts.groupby(ts.index.time).median()
ts_quartile_1 = ts.groupby(ts.index.time).quantile(0.25)
ts_quartile_3 = ts.groupby(ts.index.time).quantile(0.75)
ax = ts_mean.plot(y='duration', figsize=(14, 8), color="red", label="mean", alpha=0.75)
ts_median.plot(ax=ax, color="blue", label="median", alpha=0.75)
ts_quartile_1.plot(ax=ax, color="blue", alpha=0.5, style=":", label="1st quartile")
ts_quartile_3.plot(ax=ax, color="blue", alpha=0.5, style=":", label="3rd quartile")
plt.fill_between(ts_quartile_1.index, ts_quartile_1.values, ts_quartile_3.values, facecolor='blue', alpha=0.1)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
dti = pd.period_range('1/1/2018 00:00', periods=960, freq='h')
ts = pd.Series(np.sin(dti.hour * 2. * np.pi / 24.), index=dti)
ts = ts + np.random.normal(scale=0.4, size=ts.shape)
ax = ts.plot(figsize=(18, 3))
ax.vlines(pd.date_range('1/1/2018 00:00', periods=960/24, freq='D'), ymin=-2, ymax=2, color="red", linestyle=":", alpha=0.3);
In [ ]:
ts_mean = ts.groupby(ts.index.start_time.time).mean() # Note the ".start_time" here
ts_median = ts.groupby(ts.index.start_time.time).median() # Note the ".start_time" here
ts_quartile_1 = ts.groupby(ts.index.start_time.time).quantile(0.25) # Note the ".start_time" here
ts_quartile_3 = ts.groupby(ts.index.start_time.time).quantile(0.75) # Note the ".start_time" here
ax = ts_mean.plot(y='duration', figsize=(14, 8), color="red", label="mean", alpha=0.75)
ts_median.plot(ax=ax, color="blue", label="median", alpha=0.75)
ts_quartile_1.plot(ax=ax, color="blue", alpha=0.5, style=":", label="1st quartile")
ts_quartile_3.plot(ax=ax, color="blue", alpha=0.5, style=":", label="3rd quartile")
plt.fill_between(ts_quartile_1.index, ts_quartile_1.values, ts_quartile_3.values, facecolor='blue', alpha=0.1)
ax.legend()
ax.set_xlabel('Time');
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 12:00', '2018-1-2 00:31', '2018-1-2 12:25']) # Here time is not aligned (non constant frequency)
ts = pd.Series([1., 0., 2., 1.], index=dti)
print(ts)
In [ ]:
ts.index.round('H')
TODO: is it the cleanest way to do this ?
In [ ]:
dti = pd.DatetimeIndex(['2018-1-1 00:00', '2018-1-1 06:45', '2018-1-1 12:00',
'2018-1-3 00:00', '2018-1-3 06:00'])
s = pd.Series(np.ones(dti.shape), index=dti)
In [ ]:
#dti.groupby(dti.date) # it works but it returns a dictionary...
In [ ]:
s.groupby(dti.date).count().plot.bar(color="blue", alpha=0.5);
In [ ]:
s.resample('1d').count().plot.bar(color="blue", alpha=0.5);
In [ ]:
#help(df.plot)
In [ ]:
x = np.arange(0, 6, 0.1)
y1 = np.cos(x)
y2 = np.sin(x)
Y = np.array([y1, y2]).T
df = pd.DataFrame(Y,
columns=['cos(x)', 'sin(x)'],
index=x)
df.iloc[:10]
In [ ]:
df.plot(legend=True)
or
In [ ]:
df.plot.line(legend=True)
In [ ]:
df = pd.DataFrame(np.random.randn(36, 2))
df.plot(drawstyle="steps", linewidth=2)
df.plot(drawstyle="steps-post", linewidth=2);
In [ ]:
x = np.arange(0, 6, 0.5)
y1 = np.cos(x)
y2 = np.sin(x)
Y = np.array([y1, y2]).T
df = pd.DataFrame(Y,
columns=['cos(x)', 'sin(x)'],
index=x)
df
In [ ]:
df.plot.bar(legend=True)
In [ ]:
df.plot.bar(legend=True, stacked=True)
In [ ]:
df.plot.barh(legend=True)
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.hist(alpha=0.5, bins=100, legend=True);
To normalize the $y$ axis, use density=True
:
In [ ]:
df.plot.hist(alpha=0.5, bins=100, legend=True, density=True);
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.box()
In [ ]:
df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(1000)
df.plot.hexbin(x='a', y='b', gridsize=25)
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.kde()
In [ ]:
df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df.plot.area()
In [ ]:
x = np.random.randint(low=0, high=6, size=(50))
df = pd.DataFrame(x, columns=["A"])
df.A.value_counts()
In [ ]:
df.A.value_counts().plot.pie(y="A")
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.scatter(x=r'$\mathcal{N}(0,1)$',
y=r'$\mathcal{N}(3,2)$',
alpha=0.2)