In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [7]:
# Read the IQ/size dataset from the CSV file into a DataFrame.
df1 = pd.read_csv("../data/iqsize.csv")
# Preview the first 10 rows; head() returns 5 rows when n is omitted.
df1.head(n=10)
Out[7]:
In [8]:
# Show the column labels and the row index of the DataFrame.
print(f"Columns: {df1.columns}")
print(f"Rows: {df1.index}")
One common property of the dataframe to check is the size (in terms of number of rows and columns)
In [8]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df1.shape
Out[8]:
An Index is iterable, so we can iterate through df1.columns and then check the dtype of each column (each column is a Series)
In [9]:
# Walk over the column labels and report the dtype of each column.
# The print call has to be indented to form the loop body; left at
# column 0 the cell fails with an IndentationError before running.
for column in df1.columns:
    print("column name: {} - dtype: {}".format(column, df1[column].dtype))
This is a valid way of checking the variable names and their types. Note:
We can also access the DataFrame dtypes directly, this way:
In [17]:
# dtype of every column, indexed by column label.
df1.dtypes
Out[17]:
The result is a series, so we can still do transformations on the result
In [18]:
# Check what kind of object df1.dtypes is.
type(df1.dtypes)
Out[18]:
We can use df1.dtypes to select only integer columns
In [24]:
# Read from the inside out:
#   df1.dtypes == np.int64   -> boolean mask over the columns
#   df1.dtypes[mask].index   -> labels of the columns whose dtype is int64
#   df1[labels].head()       -> preview of just those columns
df1[df1.dtypes[df1.dtypes == np.int64].index].head()
Out[24]:
What happened here?
From inner to outer operations:
df1.dtypes == np.int64. This operation returns a Boolean Series object
In [27]:
# Element-wise comparison of each column dtype against int64.
result1 = df1.dtypes == np.int64
# display() renders the object the same way a bare last expression would.
display(result1)
In [28]:
# Confirm what kind of object the comparison produced.
type(result1)
Out[28]:
Slice df1.dtypes using the result1 series. Note that the result is a Series containing the values of the dtypes of the original dataframe
In [32]:
# Boolean indexing: keep only the dtype entries where result1 is True.
df1.dtypes[result1]
Out[32]:
If we want to select the columns instead of the values, we need the indexes
In [36]:
# The index of the filtered dtypes Series holds the matching column labels.
result2 = df1.dtypes[result1].index
display(result2)
We can get the same result using result1 to slice df1.columns
In [35]:
# Apply the boolean mask result1 directly to the column Index.
df1.columns[result1]
Out[35]:
Finally, select the columns of df1 using result2
In [38]:
# Select the columns whose labels are in result2 and preview the first rows.
df1[result2].head()
Out[38]:
Check whether each variable has the proper type
In [39]:
# List the dtype of each column so it can be checked against what the variable represents.
df1.dtypes
Out[39]:
Use DataFrame.loc and DataFrame.iloc to (in each case print the result and check what data type you obtain as response):
Using iloc:
In [48]:
# All rows (:) of the column at position 1 (positions are 0-based), previewed with head().
df1.iloc[:,1].head()
Out[48]:
We can also use loc
In [42]:
# loc works with labels, so look up the column labels first.
df1.columns
Out[42]:
In [43]:
# Label of the column at position 1.
df1.columns[1]
Out[43]:
In [47]:
# All rows of the column labelled 'piq', selected by label.
df1.loc[:,'piq'].head()
Out[47]:
In [45]:
# Same label-based selection, with the label looked up by position instead of hard-coded.
df1.loc[:,df1.columns[1]].head()
Out[45]:
In [51]:
# Print the type of object returned when a single column is selected.
print(type(df1.loc[:,df1.columns[1]].head()))
B. Get the third row
Using iloc
In [49]:
# iloc positions are 0-based, so the third row sits at position 2
# (position 3 would return the fourth row).
df1.iloc[2,:]
Out[49]:
Using loc
In [50]:
# loc selects by index label. df1 was read without an index column, so it
# carries the default 0-based integer index and the third row has label 2
# (label 3 would return the fourth row).
df1.loc[2,:]
Out[50]:
In [53]:
# Print the type of object returned when a single row is selected.
# Label 2 is the third row under the default 0-based index
# (label 3 would be the fourth row).
print(type(df1.loc[2,:]))
C. Get all but last column
In [54]:
# Column labels, needed to build the "all but last" selection below.
df1.columns
Out[54]:
In [56]:
# Slicing the column Index with [:-1] drops the last label.
df1.columns[:-1]
Out[56]:
In [57]:
# Every row, and every column except the last one, selected by label.
df1.loc[:,df1.columns[:-1]].head()
Out[57]:
In [58]:
# Print the type of object returned when several columns are selected.
print(type(df1.loc[:,df1.columns[:-1]].head()))
D. Get rows from 4 to 10
In [60]:
# iloc slices exclude the stop position, so 4:11 covers positions 4 through 10.
df1.iloc[4:11,:]
Out[60]:
In [61]:
# Print the type of object returned when a range of rows is selected.
print(type(df1.iloc[4:11,:]))
E. Get the values in rows 3 and 4 of columns 2 and 3
Using iloc
In [62]:
# Row positions 2-3 and column positions 3-4 (stop positions are excluded).
df1.iloc[2:4,3:5]
Out[62]:
In [10]:
# Print the type of object returned for a rows-by-columns selection.
print(type(df1.iloc[2:4,3:5]))
Using loc
In [63]:
# Column labels, needed to express the same selection with loc.
df1.columns
Out[63]:
In [65]:
# Labels of the columns at positions 3 and 4.
df1.columns[3:5]
Out[65]:
In [67]:
# loc slices are label-based and include both endpoints, so 2:3 selects labels 2 and 3.
df1.loc[2:3,["height","weight"]]
Out[67]:
In [73]:
# Same selection, with the column labels looked up by position instead of hard-coded.
# The row slice 2:3 is label-based and includes both endpoints.
df1.loc[2:3,df1.columns[3:5]]
Out[73]:
F. Get all iq values greater than 100
In [83]:
# The task asks for IQ values above 100, so both the filter and the
# selected column must be the IQ column ("piq"), not "weight".
# The boolean mask picks the rows; naming the column returns a Series.
df1.loc[df1.loc[:,"piq"] > 100,"piq"].head()
Out[83]:
G. Divide previous results by 100
In [84]:
# Divide the IQ values above 100 by 100. The selection must be made on
# the IQ column ("piq") rather than "weight" to match the task statement.
# The division is applied element-wise to the selected Series.
(df1.loc[df1.loc[:,"piq"] > 100,"piq"] / 100).head()
Out[84]:
In [85]:
# Summary statistics per column (describe() reports on numeric columns by default).
df1.describe()
Out[85]:
In [92]:
# Largest value in the "brain" column.
df1["brain"].max()
Out[92]:
In [93]:
# Arithmetic mean of the "brain" column.
df1["brain"].mean()
Out[93]:
In [94]:
# Standard deviation of the "brain" column (pandas uses the sample estimate, ddof=1, by default).
df1["brain"].std()
Out[94]:
In [90]:
# Keep the "sex" column as a standalone Series for the next cell.
s = df1["sex"]
In [91]:
# Count how many rows carry each distinct value of s.
s.value_counts()
Out[91]: