In [ ]:
%matplotlib inline
#%matplotlib notebook
from IPython.display import display
import matplotlib
matplotlib.rcParams['figure.figsize'] = (9, 9)
import pandas as pd
import numpy as np
In [ ]:
!head -n30 /Users/jdecock/Downloads/CA20170725_1744.CSV
In [ ]:
#df = pd.read_csv("/Users/jdecock/Downloads/CA20170725_1744.CSV")
df = pd.read_csv("/Users/jdecock/Downloads/CA20170725_1744.CSV",
sep=';',
index_col=0,
usecols=range(4), # the last column is empty...
skiprows=9,
parse_dates=False,
infer_datetime_format=False,
keep_date_col=False,
date_parser=None,
dayfirst=False,
thousands=None,
decimal=',',
escapechar=None,
encoding='iso-8859-1')
df
In [ ]:
df.columns
In [ ]:
df['Débit Euros'].plot()
In [ ]:
In [ ]:
Reader functions are accessibles from the top level pd
object.
Writer functions are accessibles from data objects (i.e. Series
, DataFrame
or Panel
objects).
In [ ]:
data_array = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(data_array, index=[10, 20], columns=[100, 200, 300])
df
Simplest version:
In [ ]:
df.to_csv(path_or_buf="python_pandas_io_test.csv")
In [ ]:
!cat python_pandas_io_test.csv
Setting more options:
In [ ]:
# FYI, many other options are available
df.to_csv(path_or_buf="python_pandas_io_test.csv",
sep=',',
columns=None,
header=True,
index=True,
index_label=None,
compression=None, # allowed values are 'gzip', 'bz2' or 'xz'
date_format=None)
In [ ]:
!cat python_pandas_io_test.csv
Simplest version:
In [ ]:
df = pd.read_csv("python_pandas_io_test.csv")
df
Setting more options:
In [ ]:
df = pd.read_csv("python_pandas_io_test.csv",
sep=',',
delimiter=None,
header='infer',
names=None,
index_col=0,
usecols=None,
squeeze=False,
prefix=None,
mangle_dupe_cols=True,
dtype=None,
engine=None,
converters=None,
true_values=None,
false_values=None,
skipinitialspace=False,
skiprows=None,
nrows=None,
na_values=None,
keep_default_na=True,
na_filter=True,
verbose=False,
skip_blank_lines=True,
parse_dates=False,
infer_datetime_format=False,
keep_date_col=False,
date_parser=None,
dayfirst=False,
iterator=False,
chunksize=None,
compression='infer',
thousands=None,
decimal=b'.',
lineterminator=None,
quotechar='"',
quoting=0,
escapechar=None,
comment=None,
encoding=None,
dialect=None,
tupleize_cols=False,
error_bad_lines=True,
warn_bad_lines=True,
skipfooter=0,
skip_footer=0,
doublequote=True,
delim_whitespace=False,
as_recarray=False,
compact_ints=False,
use_unsigned=False,
low_memory=True,
buffer_lines=None,
memory_map=False,
float_precision=None)
df
In [ ]:
!rm python_pandas_io_test.csv
In [ ]:
import io
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test.json")
In [ ]:
!cat python_pandas_io_test.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_split.json",
orient="split")
In [ ]:
!cat python_pandas_io_test_split.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_records.json",
orient="records")
In [ ]:
!cat python_pandas_io_test_records.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_index.json",
orient="index")
In [ ]:
!cat python_pandas_io_test_index.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_columns.json",
orient="columns")
In [ ]:
!cat python_pandas_io_test_columns.json
In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_values.json",
orient="values")
In [ ]:
!cat python_pandas_io_test_values.json
In [ ]:
# FYI, many other options are available
df.to_json(path_or_buf="python_pandas_io_test.json",
orient='columns', # For DataFrame: 'split','records','index','columns' or 'values'
date_format=None, # None, 'epoch' or 'iso'
double_precision=10,
force_ascii=True,
date_unit='ms')
In [ ]:
!cat python_pandas_io_test.json
Dict like data {index -> [index], columns -> [columns], data -> [values]}
In [ ]:
!cat python_pandas_io_test_split.json
In [ ]:
df = pd.read_json("python_pandas_io_test_split.json",
orient="split")
df
List like [{column -> value}, ... , {column -> value}]
In [ ]:
!cat python_pandas_io_test_records.json
In [ ]:
df = pd.read_json("python_pandas_io_test_records.json",
orient="records")
df
Dict like {index -> {column -> value}}
In [ ]:
!cat python_pandas_io_test_index.json
In [ ]:
df = pd.read_json("python_pandas_io_test_index.json",
orient="index")
df
Dict like {column -> {index -> value}}
In [ ]:
!cat python_pandas_io_test_columns.json
In [ ]:
df = pd.read_json("python_pandas_io_test_columns.json",
orient="columns")
df
Just the values array
In [ ]:
!cat python_pandas_io_test_values.json
In [ ]:
df = pd.read_json("python_pandas_io_test_values.json",
orient="values")
df
In [ ]:
df = pd.read_json("python_pandas_io_test.json",
orient=None,
typ='frame',
dtype=True,
convert_axes=True,
convert_dates=True,
keep_default_dates=True,
numpy=False,
precise_float=False,
date_unit=None,
encoding=None,
lines=False)
df
In [ ]:
!rm python_pandas_io_test*.json
Many other file formats can be used to import or export data with JSON.
See the following link for more information: http://pandas.pydata.org/pandas-docs/stable/io.html
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df.B
In [ ]:
df["B"]
In [ ]:
df.loc[:,"B"]
In [ ]:
df.loc[:,['A','B']]
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df.B < 50.
In [ ]:
df[df.B < 50.]
In [ ]:
df.iloc[:5]
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df[df.B < 50][df.A >= 2].loc[:,['A','B']]
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df.B *= 2.
df
In [ ]:
df.B = pow(df.B, 2)
df
In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
index=np.arange(1, 10, 1),
columns=['A', 'B', 'C'])
df
In [ ]:
df[df.B < 50.] *= -1.
df
In [ ]:
df[df.B < 50.] = pow(df[df.B < 50.], 2)
df
In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
columns=['ID', 'B', 'C'])
a2 = np.array([np.arange(1, 5, 1), np.arange(1000, 5000, 1000), np.arange(10000, 50000, 10000)]).T
df2 = pd.DataFrame(a2,
columns=['ID', 'B', 'C'])
display(df1)
display(df2)
df = pd.merge(df1, df2, on="ID", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
columns=['ID', 'B', 'C'])
a2 = np.array([np.arange(1, 5, 1), np.arange(1000, 5000, 1000), np.arange(10000, 50000, 10000)]).T
df2 = pd.DataFrame(a2,
columns=['ID', 'B', 'C'])
df1.iloc[0,2] = np.nan
df1.iloc[1,1] = np.nan
df1.iloc[2,2] = np.nan
df1.iloc[3,1] = np.nan
df2.iloc[0,1] = np.nan
df2.iloc[1,2] = np.nan
df2.iloc[2,1] = np.nan
df2.iloc[3,2] = np.nan
df = pd.merge(df1, df2, on="ID", suffixes=('_1', '_2')) #.dropna(how='any')
display(df1)
display(df2)
display(df)
In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
columns=['ID', 'B', 'C'])
a2 = np.array([np.arange(1, 3, 1), np.arange(1000, 3000, 1000), np.arange(10000, 30000, 10000)]).T
df2 = pd.DataFrame(a2,
columns=['ID', 'B', 'C'])
display(df1)
display(df2)
print("Left: use only keys from left frame (SQL: left outer join)")
df = pd.merge(df1, df2, on="ID", how="left", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
print("Right: use only keys from right frame (SQL: right outer join)")
df = pd.merge(df1, df2, on="ID", how="right", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
print("Inner: use intersection of keys from both frames (SQL: inner join) [DEFAULT]")
df = pd.merge(df1, df2, on="ID", how="inner", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
print("Outer: use union of keys from both frames (SQL: full outer join)")
df = pd.merge(df1, df2, on="ID", how="outer", suffixes=('_1', '_2')) #.dropna(how='any')
display(df)
In [ ]:
a = np.array([[3, 5, 5, 5, 7, 7, 7, 7],
[2, 4, 4, 3, 1, 3, 3, 2],
[3, 4, 5, 6, 1, 8, 9, 8]]).T
df = pd.DataFrame(a,
columns=['A', 'B', 'C'])
df
In [ ]:
df.groupby(["A"]).count()
In [ ]:
df.groupby(["A"]).sum().B
In [ ]:
df.groupby(["A"]).mean().B
In [ ]:
df.groupby(["A","B"]).count()
In [ ]:
df.A.value_counts()
In [ ]:
df.A.value_counts().plot.bar()
In [ ]:
a = np.array([[3, np.nan, 5, np.nan, 7, 7, 7, 7],
[2, 4, 4, 3, 1, 3, 3, 2],
[3, 4, 5, 6, 1, 8, 9, 8]]).T
df = pd.DataFrame(a,
columns=['A', 'B', 'C'])
df
In [ ]:
df.A.isnull().sum()
In [ ]:
#help(df.plot)
In [ ]:
x = np.arange(0, 6, 0.1)
y1 = np.cos(x)
y2 = np.sin(x)
Y = np.array([y1, y2]).T
df = pd.DataFrame(Y,
columns=['cos(x)', 'sin(x)'],
index=x)
df.iloc[:10]
In [ ]:
df.plot(legend=True)
or
In [ ]:
df.plot.line(legend=True)
In [ ]:
x = np.arange(0, 6, 0.5)
y1 = np.cos(x)
y2 = np.sin(x)
Y = np.array([y1, y2]).T
df = pd.DataFrame(Y,
columns=['cos(x)', 'sin(x)'],
index=x)
df
In [ ]:
df.plot.bar(legend=True)
In [ ]:
df.plot.bar(legend=True, stacked=True)
In [ ]:
df.plot.barh(legend=True)
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.hist(alpha=0.2, bins=100, legend=True)
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.box()
In [ ]:
df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(1000)
df.plot.hexbin(x='a', y='b', gridsize=25)
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.kde()
In [ ]:
df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df.plot.area()
In [ ]:
x = np.random.randint(low=0, high=6, size=(50))
df = pd.DataFrame(x, columns=["A"])
df.A.value_counts()
In [ ]:
df.A.value_counts().plot.pie(y="A")
In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T
df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])
df.plot.scatter(x=r'$\mathcal{N}(0,1)$',
y=r'$\mathcal{N}(3,2)$',
alpha=0.2)