In [1]:
import pandas as pd

In [2]:
# Read a header-less CSV with the defaults: the first line of the file is
# consumed as the column labels, so only the remaining lines become data rows.
df = pd.read_csv('data/src/sample.csv')
print(df)


   11  12  13  14
0  21  22  23  24
1  31  32  33  34

In [3]:
# The labels taken from the first CSV line are strings, not numbers.
print(df.columns)


Index(['11', '12', '13', '14'], dtype='object')

In [4]:
# header=None: no line is treated as a header; columns are labelled 0..N-1
# and every line of the file is kept as data.
df_none = pd.read_csv('data/src/sample.csv', header=None)
print(df_none)


    0   1   2   3
0  11  12  13  14
1  21  22  23  24
2  31  32  33  34

In [5]:
# names=...: supply the column labels explicitly; every line of the file is
# kept as data.
df_names = pd.read_csv('data/src/sample.csv', names=('A', 'B', 'C', 'D'))
print(df_names)


    A   B   C   D
0  11  12  13  14
1  21  22  23  24
2  31  32  33  34

In [6]:
# A CSV that does have a header line: the defaults pick it up as column labels.
df_header = pd.read_csv('data/src/sample_header.csv')
print(df_header)


    a   b   c   d
0  11  12  13  14
1  21  22  23  24
2  31  32  33  34

In [7]:
# header=0 gives the same result as the default read of this file.
df_header_0 = pd.read_csv('data/src/sample_header.csv', header=0)
print(df_header_0)


    a   b   c   d
0  11  12  13  14
1  21  22  23  24
2  31  32  33  34

In [8]:
# header=2: the line at 0-based position 2 becomes the header; the lines
# above it are discarded and only the lines below it are data.
df_header_2 = pd.read_csv('data/src/sample_header.csv', header=2)
print(df_header_2)


   21  22  23  24
0  31  32  33  34

In [9]:
# A CSV whose first column holds row labels: without index_col it is read as
# an ordinary column, labelled 'Unnamed: 0'.
df_header_index = pd.read_csv('data/src/sample_header_index.csv')
print(df_header_index)


  Unnamed: 0   a   b   c   d
0        ONE  11  12  13  14
1        TWO  21  22  23  24
2      THREE  31  32  33  34

In [10]:
# Without index_col the frame gets a default RangeIndex.
print(df_header_index.index)


RangeIndex(start=0, stop=3, step=1)

In [11]:
# index_col=0: use the first CSV column as the row index instead of data.
df_header_index_col = pd.read_csv('data/src/sample_header_index.csv', index_col=0)
print(df_header_index_col)


        a   b   c   d
ONE    11  12  13  14
TWO    21  22  23  24
THREE  31  32  33  34

In [12]:
# The index now holds the string labels read from the first CSV column.
print(df_header_index_col.index)


Index(['ONE', 'TWO', 'THREE'], dtype='object')

In [13]:
# usecols with integer positions: read only the columns at positions 1 and 3.
df_none_usecols = pd.read_csv('data/src/sample.csv', header=None, usecols=[1, 3])
print(df_none_usecols)


    1   3
0  12  14
1  22  24
2  32  34

In [14]:
# A one-element usecols list reads a single column; the result is still a
# DataFrame. Note: this rebinds df_none_usecols from the previous cell.
df_none_usecols = pd.read_csv('data/src/sample.csv', header=None, usecols=[2])
print(df_none_usecols)


    2
0  13
1  23
2  33

In [15]:
# usecols with header names: read only columns 'a' and 'c'.
df_header_usecols = pd.read_csv('data/src/sample_header.csv', usecols=['a', 'c'])
print(df_header_usecols)


    a   c
0  11  13
1  21  23
2  31  33

In [16]:
# usecols with a callable: it is called with each column name and the column
# is read only when it returns True. Here every column except 'b' is kept.
# The comparison must be `!=` (equality), not `is not` (identity): identity
# against a str literal only works when the strings happen to be interned,
# and it raises a SyntaxWarning on Python 3.8+.
df_header_usecols = pd.read_csv('data/src/sample_header.csv',
                                usecols=lambda x: x != 'b')
print(df_header_usecols)


    a   c   d
0  11  13  14
1  21  23  24
2  31  33  34

In [17]:
# Callable usecols used to exclude several columns: every column whose name
# is not 'a' or 'c' is read.
df_header_usecols = pd.read_csv('data/src/sample_header.csv',
                                usecols=lambda x: x not in ['a', 'c'])
print(df_header_usecols)


    b   d
0  12  14
1  22  24
2  32  34

In [18]:
# Combining index_col with usecols: position 0 (the label column) is listed
# in usecols together with the data columns, and index_col=0 then turns it
# into the index.
df_index_usecols = pd.read_csv('data/src/sample_header_index.csv',
                               index_col=0, usecols=[0, 1, 3])
print(df_index_usecols)


        a   c
ONE    11  13
TWO    21  23
THREE  31  33

In [19]:
# Baseline for the row-skipping examples below: the full header-less file.
df_none = pd.read_csv('data/src/sample.csv', header=None)
print(df_none)


    0   1   2   3
0  11  12  13  14
1  21  22  23  24
2  31  32  33  34

In [20]:
# skiprows as an integer: skip that many lines from the top of the file.
# Note: this rebinds df_none, which held the full frame until now.
df_none = pd.read_csv('data/src/sample.csv', header=None, skiprows=2)
print(df_none)


    0   1   2   3
0  31  32  33  34

In [21]:
# skiprows as a list: skip the lines at the given 0-based positions (0 and 2).
df_none_skiprows = pd.read_csv('data/src/sample.csv', header=None, skiprows=[0, 2])
print(df_none_skiprows)


    0   1   2   3
0  21  22  23  24

In [22]:
# To skip a single specific line, wrap its position in a list; here only the
# line at position 1 is dropped.
df_none_skiprows = pd.read_csv('data/src/sample.csv', header=None, skiprows=[1])
print(df_none_skiprows)


    0   1   2   3
0  11  12  13  14
1  31  32  33  34

In [23]:
# skiprows with a callable: it receives each line number and the line is
# skipped when it returns True, so this keeps only lines 0 and 2.
df_none_skiprows = pd.read_csv('data/src/sample.csv', header=None,
                               skiprows=lambda x: x not in [0, 2])
print(df_none_skiprows)


    0   1   2   3
0  11  12  13  14
1  31  32  33  34

In [24]:
# With a header line present, line numbers in skiprows count the header as
# line 0, so skiprows=[1] drops the first data row and keeps the header.
df_header_skiprows = pd.read_csv('data/src/sample_header.csv', skiprows=[1])
print(df_header_skiprows)


    a   b   c   d
0  21  22  23  24
1  31  32  33  34

In [25]:
# Skipping line 0 removes the real header, so the first remaining line is
# used as the header instead; line 3 (the last data row) is dropped too.
df_header_skiprows = pd.read_csv('data/src/sample_header.csv', skiprows=[0, 3])
print(df_header_skiprows)


   11  12  13  14
0  21  22  23  24

In [26]:
# skipfooter=1: drop one line from the end of the file.
# engine='python' is passed explicitly because pandas documents skipfooter as
# unsupported by the default C parser.
df_none_skipfooter = pd.read_csv('data/src/sample.csv', header=None,
                                 skipfooter=1, engine='python')
print(df_none_skipfooter)


    0   1   2   3
0  11  12  13  14
1  21  22  23  24

In [27]:
# nrows=2: read only the first two data rows of the file.
df_none_nrows = pd.read_csv('data/src/sample.csv', header=None, nrows=2)
print(df_none_nrows)


    0   1   2   3
0  11  12  13  14
1  21  22  23  24

In [28]:
# Baseline for the dtype examples: let read_csv infer each column's type.
df_default = pd.read_csv('data/src/sample_header_index_dtype.csv', index_col=0)
print(df_default)


       a    b      c  d
ONE    1    1  100.0  x
TWO    2   20    NaN  y
THREE  3  300  300.0  z

In [29]:
# Inferred dtypes: a and b are int64, c (which contains a NaN) is float64,
# and the string column d is object.
print(df_default.dtypes)


a      int64
b      int64
c    float64
d     object
dtype: object

In [30]:
# Show the Python type of every individual element.
# NOTE(review): DataFrame.applymap is deprecated in pandas 2.1+ in favour of
# DataFrame.map -- switch if the target pandas version allows.
print(df_default.applymap(type))


                   a              b                c              d
ONE    <class 'int'>  <class 'int'>  <class 'float'>  <class 'str'>
TWO    <class 'int'>  <class 'int'>  <class 'float'>  <class 'str'>
THREE  <class 'int'>  <class 'int'>  <class 'float'>  <class 'str'>

In [31]:
# dtype=str: read every column as strings. Leading zeros in b survive
# ('001', '020') and c keeps its textual form ('100'), but the missing value
# in c is still NaN.
df_str = pd.read_csv('data/src/sample_header_index_dtype.csv',
                     index_col=0, dtype=str)
print(df_str)


       a    b    c  d
ONE    1  001  100  x
TWO    2  020  NaN  y
THREE  3  300  300  z

In [32]:
# Columns of str values are reported with dtype object.
print(df_str.dtypes)


a    object
b    object
c    object
d    object
dtype: object

In [33]:
# Element-wise types: everything is str except the missing value, which
# remains a float NaN.
# NOTE(review): DataFrame.applymap is deprecated in pandas 2.1+ in favour of
# DataFrame.map -- switch if the target pandas version allows.
print(df_str.applymap(type))


                   a              b                c              d
ONE    <class 'str'>  <class 'str'>    <class 'str'>  <class 'str'>
TWO    <class 'str'>  <class 'str'>  <class 'float'>  <class 'str'>
THREE  <class 'str'>  <class 'str'>    <class 'str'>  <class 'str'>

In [34]:
# dtype=object yields the same frame as dtype=str for this file.
df_object = pd.read_csv('data/src/sample_header_index_dtype.csv',
                        index_col=0, dtype=object)
print(df_object)


       a    b    c  d
ONE    1  001  100  x
TWO    2  020  NaN  y
THREE  3  300  300  z

In [35]:
# All columns are object, exactly as with dtype=str.
print(df_object.dtypes)


a    object
b    object
c    object
d    object
dtype: object

In [36]:
# Element-wise types match the dtype=str case: str everywhere except the
# float NaN.
# NOTE(review): DataFrame.applymap is deprecated in pandas 2.1+ in favour of
# DataFrame.map -- switch if the target pandas version allows.
print(df_object.applymap(type))


                   a              b                c              d
ONE    <class 'str'>  <class 'str'>    <class 'str'>  <class 'str'>
TWO    <class 'str'>  <class 'str'>  <class 'float'>  <class 'str'>
THREE  <class 'str'>  <class 'str'>    <class 'str'>  <class 'str'>

In [37]:
# df_int = pd.read_csv('data/src/sample_header_index_dtype.csv',
#                      index_col=0, dtype=int)
# ValueError: invalid literal for int() with base 10: 'ONE'

In [38]:
# Convert a single column after loading: astype with a {column: type} dict
# casts only 'a' to int and leaves the other string columns untouched.
df_str_cast = df_str.astype({'a': int})
print(df_str_cast)


       a    b    c  d
ONE    1  001  100  x
TWO    2  020  NaN  y
THREE  3  300  300  z

In [39]:
# Only column a changed to int64; b, c and d are still object.
print(df_str_cast.dtypes)


a     int64
b    object
c    object
d    object
dtype: object

In [40]:
# dtype as a {column name: type} dict: force only b and c to str and let the
# remaining columns be inferred.
df_str_col = pd.read_csv('data/src/sample_header_index_dtype.csv',
                         index_col=0, dtype={'b': str, 'c': str})
print(df_str_col)


       a    b    c  d
ONE    1  001  100  x
TWO    2  020  NaN  y
THREE  3  300  300  z

In [41]:
# a is still inferred as int64; b and c were read as strings (object).
print(df_str_col.dtypes)


a     int64
b    object
c    object
d    object
dtype: object

In [42]:
# dtype keyed by column position instead of name. Positions count the index
# column as 0, so 2 and 3 refer to b and c here (not c and d).
df_str_col_num = pd.read_csv('data/src/sample_header_index_dtype.csv',
                             index_col=0, dtype={2: str, 3: str})
print(df_str_col_num)


       a    b    c  d
ONE    1  001  100  x
TWO    2  020  NaN  y
THREE  3  300  300  z

In [43]:
# Same dtypes as when b and c were selected by name.
print(df_str_col_num.dtypes)


a     int64
b    object
c    object
d    object
dtype: object

In [44]:
# Default missing-value handling: the empty field and the strings 'null',
# 'NaN', 'nan' and 'N/A' in this file are all read as NaN, while '-' is kept
# as a normal string.
df_nan = pd.read_csv('data/src/sample_header_index_nan.csv', index_col=0)
print(df_nan)


         a   b
ONE    NaN NaN
TWO      - NaN
THREE  NaN NaN

In [45]:
# Only the '-' cell is not considered missing.
print(df_nan.isnull())


           a     b
ONE     True  True
TWO    False  True
THREE   True  True

In [46]:
# na_values='-': treat '-' as missing in addition to the default markers.
df_nan_set_na = pd.read_csv('data/src/sample_header_index_nan.csv',
                            index_col=0, na_values='-')
print(df_nan_set_na)


        a   b
ONE   NaN NaN
TWO   NaN NaN
THREE NaN NaN

In [47]:
# Every cell is now considered missing.
print(df_nan_set_na.isnull())


          a     b
ONE    True  True
TWO    True  True
THREE  True  True

In [48]:
# keep_default_na=False: the default missing markers are disabled and only
# the values listed in na_values ('-', 'NaN', 'null') are read as NaN. The
# empty field, 'nan' and 'N/A' therefore stay as strings.
df_nan_set_na_no_keep = pd.read_csv('data/src/sample_header_index_nan.csv',
                                    index_col=0, na_values=['-', 'NaN', 'null'],
                                    keep_default_na=False)
print(df_nan_set_na_no_keep)


         a    b
ONE         NaN
TWO    NaN  nan
THREE  NaN  N/A

In [49]:
# Only the cells that matched one of the explicit na_values are missing.
print(df_nan_set_na_no_keep.isnull())


           a      b
ONE    False   True
TWO     True  False
THREE   True  False

In [50]:
# na_filter=False: disable missing-value detection entirely; every field,
# including the empty one, is kept as the string found in the file.
df_nan_no_filter = pd.read_csv('data/src/sample_header_index_nan.csv',
                               index_col=0, na_filter=False)
print(df_nan_no_filter)


          a    b
ONE          NaN
TWO       -  nan
THREE  null  N/A

In [51]:
# No cell is considered missing when na_filter is off.
print(df_nan_no_filter.isnull())


           a      b
ONE    False  False
TWO    False  False
THREE  False  False

In [52]:
# Tab-separated file: read_table parses it without an explicit separator.
df_tsv = pd.read_table('data/src/sample_header_index.tsv', index_col=0)
print(df_tsv)


        a   b   c   d
ONE    11  12  13  14
TWO    21  22  23  24
THREE  31  32  33  34

In [53]:
# The same TSV can be read with read_csv by passing the tab as separator.
df_tsv_sep = pd.read_csv('data/src/sample_header_index.tsv', index_col=0, sep='\t')
print(df_tsv_sep)


        a   b   c   d
ONE    11  12  13  14
TWO    21  22  23  24
THREE  31  32  33  34