Read CA CSV

Import directives


In [ ]:
%matplotlib inline
#%matplotlib notebook

from IPython.display import display

import matplotlib
matplotlib.rcParams['figure.figsize'] = (9, 9)

import pandas as pd
import numpy as np

In [ ]:
!head -n30 /Users/jdecock/Downloads/CA20170725_1744.CSV

In [ ]:
#df = pd.read_csv("/Users/jdecock/Downloads/CA20170725_1744.CSV")

df = pd.read_csv("/Users/jdecock/Downloads/CA20170725_1744.CSV",
                 sep=';',
                 index_col=0,
                 usecols=range(4), # the last column is empty...
                 skiprows=9,
                 parse_dates=False,
                 infer_datetime_format=False,
                 keep_date_col=False,
                 date_parser=None,
                 dayfirst=False,
                 thousands=None,
                 decimal=',',
                 escapechar=None,
                 encoding='iso-8859-1')

df

In [ ]:
df.columns

In [ ]:
df['Débit Euros'].plot()

In [ ]:


In [ ]:

Export/import data (write/read files)

Reader functions are accessibles from the top level pd object.

Writer functions are accessibles from data objects (i.e. Series, DataFrame or Panel objects).


In [ ]:
data_array = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(data_array, index=[10, 20], columns=[100, 200, 300])
df

CSV files

Write CSV files

Simplest version:


In [ ]:
df.to_csv(path_or_buf="python_pandas_io_test.csv")

In [ ]:
!cat python_pandas_io_test.csv

Setting more options:


In [ ]:
# FYI, many other options are available
df.to_csv(path_or_buf="python_pandas_io_test.csv",
          sep=',',
          columns=None,
          header=True,
          index=True,
          index_label=None,
          compression=None,  # allowed values are 'gzip', 'bz2' or 'xz'
          date_format=None)

In [ ]:
!cat python_pandas_io_test.csv

Read CSV files

Simplest version:


In [ ]:
df = pd.read_csv("python_pandas_io_test.csv")
df

Setting more options:


In [ ]:
df = pd.read_csv("python_pandas_io_test.csv",
                 sep=',',
                 delimiter=None,
                 header='infer',
                 names=None,
                 index_col=0,
                 usecols=None,
                 squeeze=False,
                 prefix=None,
                 mangle_dupe_cols=True,
                 dtype=None,
                 engine=None,
                 converters=None,
                 true_values=None,
                 false_values=None,
                 skipinitialspace=False,
                 skiprows=None,
                 nrows=None,
                 na_values=None,
                 keep_default_na=True,
                 na_filter=True,
                 verbose=False,
                 skip_blank_lines=True,
                 parse_dates=False,
                 infer_datetime_format=False,
                 keep_date_col=False,
                 date_parser=None,
                 dayfirst=False,
                 iterator=False,
                 chunksize=None,
                 compression='infer',
                 thousands=None,
                 decimal=b'.',
                 lineterminator=None,
                 quotechar='"',
                 quoting=0,
                 escapechar=None,
                 comment=None,
                 encoding=None,
                 dialect=None,
                 tupleize_cols=False,
                 error_bad_lines=True,
                 warn_bad_lines=True,
                 skipfooter=0,
                 skip_footer=0,
                 doublequote=True,
                 delim_whitespace=False,
                 as_recarray=False,
                 compact_ints=False,
                 use_unsigned=False,
                 low_memory=True,
                 buffer_lines=None,
                 memory_map=False,
                 float_precision=None)
df

In [ ]:
!rm python_pandas_io_test.csv

JSON files


In [ ]:
import io

Write JSON files

Simplest version

In [ ]:
df.to_json(path_or_buf="python_pandas_io_test.json")

In [ ]:
!cat python_pandas_io_test.json
Setting orient="split"

In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_split.json",
           orient="split")

In [ ]:
!cat python_pandas_io_test_split.json
Setting orient="records"

In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_records.json",
           orient="records")

In [ ]:
!cat python_pandas_io_test_records.json
Setting orient="index" (the default option for Series)

In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_index.json",
           orient="index")

In [ ]:
!cat python_pandas_io_test_index.json
Setting orient="columns" (the default option for DataFrame) (for DataFrame only)

In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_columns.json",
           orient="columns")

In [ ]:
!cat python_pandas_io_test_columns.json
Setting orient="values" (for DataFrame only)

In [ ]:
df.to_json(path_or_buf="python_pandas_io_test_values.json",
           orient="values")

In [ ]:
!cat python_pandas_io_test_values.json
Setting more options

In [ ]:
# FYI, many other options are available
df.to_json(path_or_buf="python_pandas_io_test.json",
           orient='columns',     # For DataFrame: 'split','records','index','columns' or 'values'
           date_format=None,     # None, 'epoch' or 'iso'
           double_precision=10,
           force_ascii=True,
           date_unit='ms')

In [ ]:
!cat python_pandas_io_test.json

Read JSON files

Using orient="split"

Dict like data {index -> [index], columns -> [columns], data -> [values]}


In [ ]:
!cat python_pandas_io_test_split.json

In [ ]:
df = pd.read_json("python_pandas_io_test_split.json",
                  orient="split")
df
Using orient="records"

List like [{column -> value}, ... , {column -> value}]


In [ ]:
!cat python_pandas_io_test_records.json

In [ ]:
df = pd.read_json("python_pandas_io_test_records.json",
                  orient="records")
df
Using orient="index"

Dict like {index -> {column -> value}}


In [ ]:
!cat python_pandas_io_test_index.json

In [ ]:
df = pd.read_json("python_pandas_io_test_index.json",
                  orient="index")
df
Using orient="columns"

Dict like {column -> {index -> value}}


In [ ]:
!cat python_pandas_io_test_columns.json

In [ ]:
df = pd.read_json("python_pandas_io_test_columns.json",
                  orient="columns")
df
Using orient="values" (for DataFrame only)

Just the values array


In [ ]:
!cat python_pandas_io_test_values.json

In [ ]:
df = pd.read_json("python_pandas_io_test_values.json",
                  orient="values")
df
Setting more options

In [ ]:
df = pd.read_json("python_pandas_io_test.json",
                  orient=None,
                  typ='frame',
                  dtype=True,
                  convert_axes=True,
                  convert_dates=True,
                  keep_default_dates=True,
                  numpy=False,
                  precise_float=False,
                  date_unit=None,
                  encoding=None,
                  lines=False)
df

In [ ]:
!rm python_pandas_io_test*.json

Other file formats

Many other file formats can be used to import or export data with JSON.

See the following link for more information: http://pandas.pydata.org/pandas-docs/stable/io.html

Select columns


In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
                  index=np.arange(1, 10, 1),
                  columns=['A', 'B', 'C'])
df

In [ ]:
df.B

In [ ]:
df["B"]

In [ ]:
df.loc[:,"B"]

In [ ]:
df.loc[:,['A','B']]

Select rows


In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
                  index=np.arange(1, 10, 1),
                  columns=['A', 'B', 'C'])
df

In [ ]:
df.B < 50.

In [ ]:
df[df.B < 50.]

Select over index: select the 5 first rows


In [ ]:
df.iloc[:5]

Select rows and columns


In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
                  index=np.arange(1, 10, 1),
                  columns=['A', 'B', 'C'])
df

In [ ]:
df[df.B < 50][df.A >= 2].loc[:,['A','B']]

Apply a function to selected colunms values


In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
                  index=np.arange(1, 10, 1),
                  columns=['A', 'B', 'C'])
df

In [ ]:
df.B *= 2.
df

In [ ]:
df.B = pow(df.B, 2)
df

Apply a function to selected rows values


In [ ]:
data_array = np.array([np.arange(1, 10, 1), np.arange(10, 100, 10), np.arange(100, 1000, 100)]).T
df = pd.DataFrame(data_array,
                  index=np.arange(1, 10, 1),
                  columns=['A', 'B', 'C'])
df

In [ ]:
df[df.B < 50.] *= -1.
df

In [ ]:
df[df.B < 50.] = pow(df[df.B < 50.], 2)
df

Merge


In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
                   columns=['ID', 'B', 'C'])

a2 = np.array([np.arange(1, 5, 1), np.arange(1000, 5000, 1000), np.arange(10000, 50000, 10000)]).T
df2 = pd.DataFrame(a2,
                   columns=['ID', 'B', 'C'])

display(df1)
display(df2)

df = pd.merge(df1, df2, on="ID", suffixes=('_1', '_2'))  #.dropna(how='any')

display(df)

Merge with NaN


In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
                   columns=['ID', 'B', 'C'])

a2 = np.array([np.arange(1, 5, 1), np.arange(1000, 5000, 1000), np.arange(10000, 50000, 10000)]).T
df2 = pd.DataFrame(a2,
                   columns=['ID', 'B', 'C'])

df1.iloc[0,2] = np.nan
df1.iloc[1,1] = np.nan
df1.iloc[2,2] = np.nan
df1.iloc[3,1] = np.nan

df2.iloc[0,1] = np.nan
df2.iloc[1,2] = np.nan
df2.iloc[2,1] = np.nan
df2.iloc[3,2] = np.nan

df = pd.merge(df1, df2, on="ID", suffixes=('_1', '_2'))  #.dropna(how='any')

display(df1)
display(df2)
display(df)

Merge with missing rows


In [ ]:
a1 = np.array([np.arange(1, 5, 1), np.arange(10, 50, 10), np.arange(100, 500, 100)]).T
df1 = pd.DataFrame(a1,
                   columns=['ID', 'B', 'C'])

a2 = np.array([np.arange(1, 3, 1), np.arange(1000, 3000, 1000), np.arange(10000, 30000, 10000)]).T
df2 = pd.DataFrame(a2,
                   columns=['ID', 'B', 'C'])

display(df1)
display(df2)

print("Left: use only keys from left frame (SQL: left outer join)")
df = pd.merge(df1, df2, on="ID", how="left", suffixes=('_1', '_2'))  #.dropna(how='any')
display(df)

print("Right: use only keys from right frame (SQL: right outer join)")
df = pd.merge(df1, df2, on="ID", how="right", suffixes=('_1', '_2'))  #.dropna(how='any')
display(df)

print("Inner: use intersection of keys from both frames (SQL: inner join) [DEFAULT]")
df = pd.merge(df1, df2, on="ID", how="inner", suffixes=('_1', '_2'))  #.dropna(how='any')
display(df)

print("Outer: use union of keys from both frames (SQL: full outer join)")
df = pd.merge(df1, df2, on="ID", how="outer", suffixes=('_1', '_2'))  #.dropna(how='any')
display(df)

GroupBy


In [ ]:
a = np.array([[3, 5, 5, 5, 7, 7, 7, 7],
              [2, 4, 4, 3, 1, 3, 3, 2],
              [3, 4, 5, 6, 1, 8, 9, 8]]).T
df = pd.DataFrame(a,
                  columns=['A', 'B', 'C'])

df

GroupBy with single key


In [ ]:
df.groupby(["A"]).count()

In [ ]:
df.groupby(["A"]).sum().B

In [ ]:
df.groupby(["A"]).mean().B

GroupBy with multiple keys


In [ ]:
df.groupby(["A","B"]).count()

Count the number of occurrences of a column value


In [ ]:
df.A.value_counts()

In [ ]:
df.A.value_counts().plot.bar()

Count the number of NaN values in a column


In [ ]:
a = np.array([[3, np.nan, 5, np.nan, 7, 7, 7, 7],
              [2, 4, 4, 3, 1, 3, 3, 2],
              [3, 4, 5, 6, 1, 8, 9, 8]]).T
df = pd.DataFrame(a,
                  columns=['A', 'B', 'C'])

df

In [ ]:
df.A.isnull().sum()

Plot


In [ ]:
#help(df.plot)

Line plot


In [ ]:
x = np.arange(0, 6, 0.1)
y1 = np.cos(x)
y2 = np.sin(x)
Y = np.array([y1, y2]).T

df = pd.DataFrame(Y,
                  columns=['cos(x)', 'sin(x)'],
                  index=x)
df.iloc[:10]

In [ ]:
df.plot(legend=True)

or


In [ ]:
df.plot.line(legend=True)

Bar plot


In [ ]:
x = np.arange(0, 6, 0.5)
y1 = np.cos(x)
y2 = np.sin(x)
Y = np.array([y1, y2]).T

df = pd.DataFrame(Y,
                  columns=['cos(x)', 'sin(x)'],
                  index=x)
df

Vertical


In [ ]:
df.plot.bar(legend=True)

In [ ]:
df.plot.bar(legend=True, stacked=True)

Horizontal


In [ ]:
df.plot.barh(legend=True)

Histogram


In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T

df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])

df.plot.hist(alpha=0.2, bins=100, legend=True)

Box plot


In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T

df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])

df.plot.box()

Hexbin plot


In [ ]:
df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(1000)
df.plot.hexbin(x='a', y='b', gridsize=25)

Kernel Density Estimation (KDE) plot


In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T

df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])

df.plot.kde()

Area plot


In [ ]:
df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])

df.plot.area()

Pie chart


In [ ]:
x = np.random.randint(low=0, high=6, size=(50))

df = pd.DataFrame(x, columns=["A"])
df.A.value_counts()

In [ ]:
df.A.value_counts().plot.pie(y="A")

Scatter plot


In [ ]:
x1 = np.random.normal(size=(10000))
x2 = np.random.normal(loc=3, scale=2, size=(10000))
X = np.array([x1, x2]).T

df = pd.DataFrame(X, columns=[r'$\mathcal{N}(0,1)$', r'$\mathcal{N}(3,2)$'])

df.plot.scatter(x=r'$\mathcal{N}(0,1)$',
                y=r'$\mathcal{N}(3,2)$',
                alpha=0.2)