In [1]:
import pandas as pd #importing packages
import os as os

In [2]:
pd.describe_option() #describe options for customizing


display.chop_threshold : float or None
    if set to a float value, all float values smaller then the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]

display.colheader_justify : 'left'/'right'
    Controls the justification of column headers. used by DataFrameFormatter.
    [default: right] [currently: right]

display.column_space No description available.
    [default: 12] [currently: 12]

display.date_dayfirst : boolean
    When True, prints and parses dates with the day first, eg 20/01/2005
    [default: False] [currently: False]

display.date_yearfirst : boolean
    When True, prints and parses dates with the year first, eg 2005/01/20
    [default: False] [currently: False]

display.encoding : str/unicode
    Defaults to the detected encoding of the console.
    Specifies the encoding to be used for strings returned by to_string,
    these are generally strings meant to be displayed on the console.
    [default: UTF-8] [currently: UTF-8]

display.expand_frame_repr : boolean
    Whether to print out the full DataFrame repr for wide DataFrames across
    multiple lines, `max_columns` is still respected, but the output will
    wrap-around across multiple "pages" if its width exceeds `display.width`.
    [default: True] [currently: True]

display.float_format : callable
    The callable should accept a floating point number and return
    a string with the desired format of the number. This is used
    in some places like SeriesFormatter.
    See formats.format.EngFormatter for an example.
    [default: None] [currently: None]

display.height : int
    Deprecated.
    [default: 60] [currently: 60]
    (Deprecated, use `display.max_rows` instead.)

display.large_repr : 'truncate'/'info'
    For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
    show a truncated table (the default from 0.13), or switch to the view from
    df.info() (the behaviour in earlier versions of pandas).
    [default: truncate] [currently: truncate]

display.latex.escape : bool
    This specifies if the to_latex method of a Dataframe uses escapes special
    characters.
    method. Valid values: False,True
    [default: True] [currently: True]

display.latex.longtable :bool
    This specifies if the to_latex method of a Dataframe uses the longtable
    format.
    method. Valid values: False,True
    [default: False] [currently: False]

display.latex.repr : boolean
    Whether to produce a latex DataFrame representation for jupyter
    environments that support it.
    (default: False)
    [default: False] [currently: False]

display.line_width : int
    Deprecated.
    [default: 80] [currently: 80]
    (Deprecated, use `display.width` instead.)

display.max_categories : int
    This sets the maximum number of categories pandas should output when
    printing out a `Categorical` or a Series of dtype "category".
    [default: 8] [currently: 8]

display.max_columns : int
    If max_cols is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 and pandas will auto-detect
    the width of the terminal and print a truncated object which fits
    the screen width. The IPython notebook, IPython qtconsole, or IDLE
    do not run in a terminal and hence it is not possible to do
    correct auto-detection.
    [default: 20] [currently: 20]

display.max_colwidth : int
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output.
    [default: 50] [currently: 50]

display.max_info_columns : int
    max_info_columns is used in DataFrame.info method to decide if
    per column information will be printed.
    [default: 100] [currently: 100]

display.max_info_rows : int or None
    df.info() will usually show null-counts for each column.
    For large frames this can be quite slow. max_info_rows and max_info_cols
    limit this null check only to frames with smaller dimensions than
    specified.
    [default: 1690785] [currently: 1690785]

display.max_rows : int
    If max_rows is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 and pandas will auto-detect
    the height of the terminal and print a truncated object which fits
    the screen height. The IPython notebook, IPython qtconsole, or
    IDLE do not run in a terminal and hence it is not possible to do
    correct auto-detection.
    [default: 60] [currently: 60]

display.max_seq_items : int or None
    when pretty-printing a long sequence, no more then `max_seq_items`
    will be printed. If items are omitted, they will be denoted by the
    addition of "..." to the resulting string.

    If set to None, the number of items to be printed is unlimited.
    [default: 100] [currently: 100]

display.memory_usage : bool, string or None
    This specifies if the memory usage of a DataFrame should be displayed when
    df.info() is called. Valid values True,False,'deep'
    [default: True] [currently: True]

display.mpl_style : bool
    Setting this to 'default' will modify the rcParams used by matplotlib
    to give plots a more pleasing visual style by default.
    Setting this to None/False restores the values to their initial value.
    [default: None] [currently: None]

display.multi_sparse : boolean
    "sparsify" MultiIndex display (don't display repeated
    elements in outer levels within groups)
    [default: True] [currently: True]

display.notebook_repr_html : boolean
    When True, IPython notebook will use html representation for
    pandas objects (if it is available).
    [default: True] [currently: True]

display.pprint_nest_depth : int
    Controls the number of nested levels to process when pretty-printing
    [default: 3] [currently: 3]

display.precision : int
    Floating point output precision (number of significant digits). This is
    only a suggestion
    [default: 6] [currently: 6]

display.show_dimensions : boolean or 'truncate'
    Whether to print out dimensions at the end of DataFrame repr.
    If 'truncate' is specified, only print out the dimensions if the
    frame is truncated (e.g. not display all rows and/or columns)
    [default: truncate] [currently: truncate]

display.unicode.ambiguous_as_wide : boolean
    Whether to use the Unicode East Asian Width to calculate the display text
    width.
    Enabling this may affect to the performance (default: False)
    [default: False] [currently: False]

display.unicode.east_asian_width : boolean
    Whether to use the Unicode East Asian Width to calculate the display text
    width.
    Enabling this may affect to the performance (default: False)
    [default: False] [currently: False]

display.width : int
    Width of the display in characters. In case python/IPython is running in
    a terminal this can be set to None and pandas will correctly auto-detect
    the width.
    Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
    terminal and hence it is not possible to correctly detect the width.
    [default: 80] [currently: 80]

io.excel.xls.writer : string
    The default Excel writer engine for 'xls' files. Available options:
    'xlwt' (the default).
    [default: xlwt] [currently: xlwt]

io.excel.xlsm.writer : string
    The default Excel writer engine for 'xlsm' files. Available options:
    'openpyxl' (the default).
    [default: openpyxl] [currently: openpyxl]

io.excel.xlsx.writer : string
    The default Excel writer engine for 'xlsx' files. Available options:
    'xlsxwriter' (the default), 'openpyxl'.
    [default: xlsxwriter] [currently: xlsxwriter]

io.hdf.default_format : format
    default format writing format, if None, then
    put will default to 'fixed' and append will default to 'table'
    [default: None] [currently: None]

io.hdf.dropna_table : boolean
    drop ALL nan rows when appending to a table
    [default: False] [currently: False]

mode.chained_assignment : string
    Raise an exception, warn, or no action if trying to use chained assignment,
    The default is warn
    [default: warn] [currently: warn]

mode.sim_interactive : boolean
    Whether to simulate interactive mode for purposes of testing
    [default: False] [currently: False]

mode.use_inf_as_null : boolean
    True means treat None, NaN, INF, -INF as null (old way),
    False means None and NaN are null, but INF, -INF are not null
    (new way).
    [default: False] [currently: False]



In [3]:
pd.get_option("display.memory_usage")#setting some options


Out[3]:
True

In [5]:
adult=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=None)

In [6]:
adult.head()


Out[6]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

In [7]:
names2=["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income"]

In [8]:
adult.columns=names2

In [9]:
adult.head()


Out[9]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

In [10]:
adult.describe()


Out[10]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830 40.437456
std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219 12.347429
min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 1.000000
25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000 40.000000
50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000 40.000000
75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000 45.000000
max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000 99.000000

In [22]:
adult.quantile([.1,.5])


Out[22]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
0.1 22.0 65716.0 7.0 0.0 0.0 24.0
0.5 37.0 178356.0 10.0 0.0 0.0 40.0

In [21]:
adult.quantile([.1,.5,.10,.25,.50,.75,.90,.95,.99])


Out[21]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
0.10 22.0 65716.0 7.0 0.0 0.0 24.0
0.50 37.0 178356.0 10.0 0.0 0.0 40.0
0.10 22.0 65716.0 7.0 0.0 0.0 24.0
0.25 28.0 117827.0 9.0 0.0 0.0 40.0
0.50 37.0 178356.0 10.0 0.0 0.0 40.0
0.75 48.0 237051.0 12.0 0.0 0.0 45.0
0.90 58.0 329054.0 13.0 0.0 0.0 55.0
0.95 63.0 379682.0 14.0 5013.0 0.0 60.0
0.99 74.0 510072.0 16.0 15024.0 1980.0 80.0

In [ ]:


In [15]:
adult.corr()


Out[15]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
age 1.000000 -0.076646 0.036527 0.077674 0.057775 0.068756
fnlwgt -0.076646 1.000000 -0.043195 0.000432 -0.010252 -0.018768
education-num 0.036527 -0.043195 1.000000 0.122630 0.079923 0.148123
capital-gain 0.077674 0.000432 0.122630 1.000000 -0.031615 0.078409
capital-loss 0.057775 -0.010252 0.079923 -0.031615 1.000000 0.054256
hours-per-week 0.068756 -0.018768 0.148123 0.078409 0.054256 1.000000

In [14]:
adult.corr(method='pearson', min_periods=1)


Out[14]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
age 1.000000 -0.076646 0.036527 0.077674 0.057775 0.068756
fnlwgt -0.076646 1.000000 -0.043195 0.000432 -0.010252 -0.018768
education-num 0.036527 -0.043195 1.000000 0.122630 0.079923 0.148123
capital-gain 0.077674 0.000432 0.122630 1.000000 -0.031615 0.078409
capital-loss 0.057775 -0.010252 0.079923 -0.031615 1.000000 0.054256
hours-per-week 0.068756 -0.018768 0.148123 0.078409 0.054256 1.000000

In [12]:
adult.race.value_counts()


Out[12]:
 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64

In [13]:
adult.sex.value_counts()


Out[13]:
 Male      21790
 Female    10771
Name: sex, dtype: int64

In [23]:
pd.crosstab(adult.race,adult.sex)


Out[23]:
sex Female Male
race
Amer-Indian-Eskimo 119 192
Asian-Pac-Islander 346 693
Black 1555 1569
Other 109 162
White 8642 19174

In [24]:
pd.crosstab(adult.race,adult.income)


Out[24]:
income <=50K >50K
race
Amer-Indian-Eskimo 275 36
Asian-Pac-Islander 763 276
Black 2737 387
Other 246 25
White 20699 7117

In [25]:
workclass=adult.groupby("workclass")

In [26]:
workclass.count()


Out[26]:
age fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
workclass
? 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836 1836
Federal-gov 960 960 960 960 960 960 960 960 960 960 960 960 960 960
Local-gov 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093 2093
Never-worked 7 7 7 7 7 7 7 7 7 7 7 7 7 7
Private 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696 22696
Self-emp-inc 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116 1116
Self-emp-not-inc 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541 2541
State-gov 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298 1298
Without-pay 14 14 14 14 14 14 14 14 14 14 14 14 14 14

In [27]:
workclass.mean()


Out[27]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
workclass
? 40.960240 188516.338235 9.260349 606.795752 60.760349 31.919390
Federal-gov 42.590625 185221.243750 10.973958 833.232292 112.268750 41.379167
Local-gov 41.751075 188639.712852 11.042045 880.202580 109.854276 40.982800
Never-worked 20.571429 225989.571429 7.428571 0.000000 0.000000 28.428571
Private 36.797585 192764.114734 9.879714 889.217792 80.008724 40.267096
Self-emp-inc 46.017025 175981.344086 11.137097 4875.693548 155.138889 48.818100
Self-emp-not-inc 44.969697 175608.641480 10.226289 1886.061787 116.631641 44.421881
State-gov 39.436055 184136.613251 11.375963 701.699538 83.256549 39.031587
Without-pay 47.785714 174267.500000 9.071429 487.857143 0.000000 32.714286

In [28]:
adult.transpose()


Out[28]:
0 1 2 3 4 5 6 7 8 9 ... 32551 32552 32553 32554 32555 32556 32557 32558 32559 32560
age 39 50 38 53 28 37 49 52 31 42 ... 32 43 32 53 22 27 40 58 22 52
workclass State-gov Self-emp-not-inc Private Private Private Private Private Self-emp-not-inc Private Private ... Private Private Private Private Private Private Private Private Private Self-emp-inc
fnlwgt 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ... 34066 84661 116138 321865 310152 257302 154374 151910 201490 287927
education Bachelors Bachelors HS-grad 11th Bachelors Masters 9th HS-grad Masters Bachelors ... 10th Assoc-voc Masters Masters Some-college Assoc-acdm HS-grad HS-grad HS-grad HS-grad
education-num 13 13 9 7 13 14 5 9 14 13 ... 6 11 14 14 10 12 9 9 9 9
marital-status Never-married Married-civ-spouse Divorced Married-civ-spouse Married-civ-spouse Married-civ-spouse Married-spouse-absent Married-civ-spouse Never-married Married-civ-spouse ... Married-civ-spouse Married-civ-spouse Never-married Married-civ-spouse Never-married Married-civ-spouse Married-civ-spouse Widowed Never-married Married-civ-spouse
occupation Adm-clerical Exec-managerial Handlers-cleaners Handlers-cleaners Prof-specialty Exec-managerial Other-service Exec-managerial Prof-specialty Exec-managerial ... Handlers-cleaners Sales Tech-support Exec-managerial Protective-serv Tech-support Machine-op-inspct Adm-clerical Adm-clerical Exec-managerial
relationship Not-in-family Husband Not-in-family Husband Wife Wife Not-in-family Husband Not-in-family Husband ... Husband Husband Not-in-family Husband Not-in-family Wife Husband Unmarried Own-child Wife
race White White White Black Black White Black White White White ... Amer-Indian-Eskimo White Asian-Pac-Islander White White White White White White White
sex Male Male Male Male Female Female Female Male Female Male ... Male Male Male Male Male Female Male Female Male Female
capital-gain 2174 0 0 0 0 0 0 0 14084 5178 ... 0 0 0 0 0 0 0 0 0 15024
capital-loss 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
hours-per-week 40 13 40 40 40 40 16 45 50 40 ... 40 45 11 40 40 38 40 40 20 40
native-country United-States United-States United-States United-States Cuba United-States Jamaica United-States United-States United-States ... United-States United-States Taiwan United-States United-States United-States United-States United-States United-States United-States
income <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K >50K ... <=50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K >50K

15 rows × 32561 columns


In [29]:
e=adult.groupby(['sex', "race"]).age.median().reset_index()
e.pivot(index='sex', columns='race', values='age')


Out[29]:
race Amer-Indian-Eskimo Asian-Pac-Islander Black Other White
sex
Female 36 33 37 29 35
Male 35 37 36 32 38

In [30]:
import pandasql as pdsql

In [32]:
str1="select *  from adult limit 3;"

In [33]:
df1=pdsql.sqldf(str1)
df1


Out[33]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K

In [ ]: