In [2]:
# import sys; sys.path.append('.')
from setup import *
%matplotlib inline

In [3]:
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)



In [4]:
print('Loading previously "cleaned" tweets (could take a minute or so)...')
df = pd.read_csv(os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'), index_col='id', compression='gzip',
                 quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC, low_memory=False)
print('Loaded {} tweets.'.format(len(df)))


Loading previously "cleaned" tweets (could take a minute or so)...
Loaded 193378 tweets.

In [5]:
print('df.describe() stats:')
short_desc = df.describe()
for col, stats in short_desc.T.iterrows():
    print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(dict(zip(list(stats.index.values[[0,1,2,3,7]].T), list(stats.values[[0,1,2,3,7]].T))))


df.describe() stats:
/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
favorite_count (int64)
{'max': 1165.0, 'min': 0.0, 'mean': 0.62967866044741383, 'std': 6.2513185226973791, 'count': 193378.0}
id_str (int64)
{'max': 7.345638826146775e+17, 'min': 7.2143063523191194e+17, 'mean': 7.2748881589661248e+17, 'std': 3778481285444856.5, 'count': 193378.0}
in_reply_to_status_id (float64)
{'max': 7.345608471215145e+17, 'min': 23191365028.0, 'mean': 7.270313189507031e+17, 'std': 14858768435192046.0, 'count': 11165.0}
in_reply_to_status_id_str (float64)
{'max': 7.345608471215145e+17, 'min': 23191365028.0, 'mean': 7.270313189507031e+17, 'std': 14858768435192046.0, 'count': 11165.0}
in_reply_to_user_id (float64)
{'max': 7.3382480402583552e+17, 'min': 409.0, 'mean': 22451025124249248.0, 'std': 1.249313082801933e+17, 'count': 13007.0}
in_reply_to_user_id_str (float64)
{'max': 7.3382480402583552e+17, 'min': 409.0, 'mean': 22451025124249248.0, 'std': 1.249313082801933e+17, 'count': 13007.0}
lat (float64)
{'max': 59.800736800000003, 'min': -37.78433338, 'mean': 33.957110546951782, 'std': 16.306535104738987, 'count': 643.0}
lon (float64)
{'max': 151.73564382000001, 'min': -123.01192349, 'mean': -57.314729004883347, 'std': 70.435494395994823, 'count': 643.0}
quoted_status_favorite_count (float64)
{'max': 109888.0, 'min': 0.0, 'mean': 361.76914016489991, 'std': 3222.5511177090816, 'count': 1698.0}
quoted_status_retweet_count (float64)
{'max': 84527.0, 'min': 0.0, 'mean': 298.00647820965844, 'std': 2681.6341400580304, 'count': 1698.0}
quoted_status_user_favourites_count (float64)
{'max': 640291.0, 'min': 0.0, 'mean': 7707.1395759717316, 'std': 26101.078683741667, 'count': 1698.0}
quoted_status_user_followers_count (float64)
{'max': 59921020.0, 'min': 7.0, 'mean': 335107.81978798588, 'std': 2200700.2792996545, 'count': 1698.0}
quoted_status_user_friends_count (float64)
{'max': 181261.0, 'min': 0.0, 'mean': 3878.3027090694936, 'std': 14818.43093341756, 'count': 1698.0}
quoted_status_user_id (float64)
{'max': 7.3224109876099994e+17, 'min': 4816.0, 'mean': 23150203266414092.0, 'std': 1.2658212835104374e+17, 'count': 1698.0}
quoted_status_user_id_str (float64)
{'max': 7.322410987608023e+17, 'min': 4816.0, 'mean': 23150203266414028.0, 'std': 1.2658212835104352e+17, 'count': 1698.0}
quoted_status_user_listed_count (float64)
{'max': 173929.0, 'min': 0.0, 'mean': 3456.3374558303885, 'std': 17113.085283341832, 'count': 1698.0}
quoted_status_user_statuses_count (float64)
{'max': 354746.0, 'min': 4.0, 'mean': 26693.128386336866, 'std': 49781.314248201052, 'count': 1698.0}
quoted_status_user_utc_offset (float64)
{'max': 43200.0, 'min': -39600.0, 'mean': -8585.2062588904701, 'std': 16993.135149213045, 'count': 1406.0}
retweet_count (int64)
{'max': 166648.0, 'min': 0.0, 'mean': 53.567846394108948, 'std': 877.49740412536721, 'count': 193378.0}
retweeted_status_favorite_count (float64)
{'max': 215360.0, 'min': 0.0, 'mean': 156.26144073289831, 'std': 1597.5166348846326, 'count': 69423.0}
retweeted_status_id (float64)
{'max': 7.3456245703e+17, 'min': 1140597519.0, 'mean': 7.2460575642406733e+17, 'std': 29007908869735376.0, 'count': 69423.0}
retweeted_status_id_str (float64)
{'max': 7.3456245702961152e+17, 'min': 1140597519.0, 'mean': 7.2460575642405632e+17, 'std': 29007908869735456.0, 'count': 69423.0}
retweeted_status_in_reply_to_status_id (float64)
{'max': 7.3452723504286106e+17, 'min': 3613581976.0, 'mean': 7.2422442939467597e+17, 'std': 29854853455039460.0, 'count': 2295.0}
retweeted_status_in_reply_to_status_id_str (float64)
{'max': 7.3452723504286106e+17, 'min': 3613581976.0, 'mean': 7.2422442939467597e+17, 'std': 29854853455039460.0, 'count': 2295.0}
retweeted_status_in_reply_to_user_id (float64)
{'max': 7.2733294780473754e+17, 'min': 5339.0, 'mean': 11013487585371126.0, 'std': 88240954573208480.0, 'count': 2802.0}
retweeted_status_in_reply_to_user_id_str (float64)
{'max': 7.2733294780473754e+17, 'min': 5339.0, 'mean': 11013487585371126.0, 'std': 88240954573208480.0, 'count': 2802.0}
retweeted_status_quoted_status_favorite_count (float64)
{'max': 311618.0, 'min': 0.0, 'mean': 294.7160037002775, 'std': 6840.5942490111884, 'count': 2162.0}
retweeted_status_quoted_status_retweet_count (float64)
{'max': 406556.0, 'min': 0.0, 'mean': 319.83857539315449, 'std': 8842.6321156302329, 'count': 2162.0}
retweeted_status_quoted_status_user_favourites_count (float64)
{'max': 269482.0, 'min': 0.0, 'mean': 4255.1307189542486, 'std': 13101.264209088928, 'count': 1071.0}
retweeted_status_quoted_status_user_followers_count (float64)
{'max': 6762776.0, 'min': 7.0, 'mean': 674082.1176470588, 'std': 1972163.1979931768, 'count': 1071.0}
retweeted_status_quoted_status_user_friends_count (float64)
{'max': 96189.0, 'min': 0.0, 'mean': 3565.9813258636786, 'std': 13356.713457320231, 'count': 1071.0}
retweeted_status_quoted_status_user_id (float64)
{'max': 7.28732752418e+17, 'min': 5339.0, 'mean': 43838841893600248.0, 'std': 1.7115560262246653e+17, 'count': 1071.0}
retweeted_status_quoted_status_user_id_str (float64)
{'max': 7.2873275241836134e+17, 'min': 5339.0, 'mean': 43838841893618480.0, 'std': 1.711556026225375e+17, 'count': 1071.0}
retweeted_status_quoted_status_user_listed_count (float64)
{'max': 21608.0, 'min': 0.0, 'mean': 1733.9551820728291, 'std': 3274.3065422420227, 'count': 1071.0}
retweeted_status_quoted_status_user_statuses_count (float64)
{'max': 354746.0, 'min': 4.0, 'mean': 43782.852474323059, 'std': 81090.258294518164, 'count': 1071.0}
retweeted_status_quoted_status_user_utc_offset (float64)
{'max': 43200.0, 'min': -36000.0, 'mean': -5814.241486068111, 'std': 17857.526235499183, 'count': 969.0}
retweeted_status_retweet_count (float64)
{'max': 166648.0, 'min': 1.0, 'mean': 148.14081788456275, 'std': 1459.7345420253816, 'count': 69423.0}
retweeted_status_user_favourites_count (float64)
{'max': 424498.0, 'min': 0.0, 'mean': 7552.0748743211907, 'std': 22786.294974404056, 'count': 69423.0}
retweeted_status_user_followers_count (float64)
{'max': 38507544.0, 'min': 0.0, 'mean': 142060.08318568775, 'std': 869740.8912563012, 'count': 69423.0}
retweeted_status_user_friends_count (float64)
{'max': 4717316.0, 'min': 0.0, 'mean': 7067.4456016017748, 'std': 34761.613571936403, 'count': 69423.0}
retweeted_status_user_id (float64)
{'max': 7.34092058672e+17, 'min': 22.0, 'mean': 30750178123179024.0, 'std': 1.4508188361866784e+17, 'count': 69423.0}
retweeted_status_user_id_str (float64)
{'max': 7.3409205867213619e+17, 'min': 22.0, 'mean': 30750178123180928.0, 'std': 1.4508188361867664e+17, 'count': 69423.0}
retweeted_status_user_listed_count (float64)
{'max': 173930.0, 'min': 0.0, 'mean': 1311.7073160191867, 'std': 5366.9508868242192, 'count': 69423.0}
retweeted_status_user_statuses_count (float64)
{'max': 1277548.0, 'min': 1.0, 'mean': 24416.755412471372, 'std': 53995.675407694944, 'count': 69423.0}
retweeted_status_user_utc_offset (float64)
{'max': 46800.0, 'min': -39600.0, 'mean': -8987.895079527485, 'std': 15669.797441958659, 'count': 53755.0}
user_favourites_count (int64)
{'max': 673894.0, 'min': 0.0, 'mean': 2981.3527495371759, 'std': 12739.631357957256, 'count': 193378.0}
user_followers_count (int64)
{'max': 10383944.0, 'min': 0.0, 'mean': 3443.8566176090353, 'std': 59435.460778443674, 'count': 193378.0}
user_friends_count (int64)
{'max': 382464.0, 'min': -5.0, 'mean': 1428.3015699821076, 'std': 5848.6786386995682, 'count': 193378.0}
user_id (int64)
{'max': 7.3422048072768307e+17, 'min': 22.0, 'mean': 57247007060732848.0, 'std': 1.9421740629431414e+17, 'count': 193378.0}
user_id_str (int64)
{'max': 7.3422048072768307e+17, 'min': 22.0, 'mean': 57247007060732848.0, 'std': 1.9421740629431414e+17, 'count': 193378.0}
user_listed_count (int64)
{'max': 129229.0, 'min': 0.0, 'mean': 353.92574646547178, 'std': 1126.6207791892587, 'count': 193378.0}
user_statuses_count (int64)
{'max': 2537204.0, 'min': 1.0, 'mean': 61341.779126891372, 'std': 138271.15238577841, 'count': 193378.0}
user_utc_offset (float64)
{'max': 46800.0, 'min': -39600.0, 'mean': -6377.1225523550311, 'std': 18027.9532895392, 'count': 119043.0}
quoted_status_id (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
quoted_status_id_str (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
retweeted_status_quoted_status_id (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
retweeted_status_quoted_status_id_str (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
favorite_count.1 (int64)
{'max': 1165.0, 'min': 0.0, 'mean': 0.62967866044741383, 'std': 6.2513185226973791, 'count': 193378.0}

In [6]:
# this takes a few minutes
print('Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
#  at least describe produces a dataframe of stats
desc = pandas_profiling.describe(df)
desc['table']
# for col, stats in desc['variables'].iterrows():
#     print('')
#     print(col)
#     print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
#     print(stats)

# and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms


Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc
Out[6]:
{'CAT': 2,
 'CONST': 4,
 'CORR': 17,
 'DATE': 0,
 'NUM': 38,
 'REJECTED': 21,
 'UNIQUE': 0,
 'memsize': '90.0 MiB',
 'n': 193378,
 'n_duplicates': 447,
 'nvar': 61,
 'recordsize': '488.0 B',
 'total_missing': 0.49723492373469169}

In [7]:
desc['variables']


Out[7]:
type correlation_var correlation count distinct_count is_unique memorysize mode n_infinite n_missing p_infinite p_missing p_unique top freq 25% 5% 50% 75% 95% cv histogram iqr kurtosis mad max mean min mini_histogram n_zeros p_zeros range skewness std sum variance
favorite_count NUM NaN NaN 193378 148 False 1547104 0 0 0 0 0 0.00076534 NaN NaN 0 0 0 0 2 9.92779 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 12880.5 1.02987 1165 0.629679 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 158139 0.817771 1165 89.7989 6.25132 121766 39.079
favorite_count.1 CORR favorite_count 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
geo_coordinates CAT NaN NaN 643 304 False 1547104 [42.3600825, -71.0588801] 0 192735 0 0.996675 0.472784 [42.3600825, -71.0588801] 26 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
id NUM NaN NaN 193378 183070 False 1547104 724249718457548801 0 0 0 0 0.946695 NaN NaN 7.24242e+17 7.22075e+17 7.27197e+17 7.30132e+17 7.33768e+17 0.00519387 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 5.88999e+15 -1.10308 3.18786e+15 734563882614677504 7.27489e+17 721430635231911936 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 13133247382765568 0.253459 3.77848e+15 5461934346094412227 1.42769e+31
id_str CORR id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
in_reply_to_status_id NUM NaN NaN 11165 10062 False 1547104 7.31411e+17 0 182213 0 0.942263 0.901209 NaN NaN 7.24486e+17 7.22009e+17 7.27255e+17 7.31397e+17 7.33801e+17 0.0204376 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 6.911e+15 1568.15 3.86773e+15 7.34561e+17 7.27031e+17 2.31914e+10 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.34561e+17 -35.2158 1.48588e+16 8.1173e+21 2.20783e+32
in_reply_to_status_id_str CORR in_reply_to_status_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
in_reply_to_user_id NUM NaN NaN 13007 8562 False 1547104 3.0085e+09 0 180371 0 0.932738 0.658261 NaN NaN 2.75862e+07 9.29635e+06 2.64281e+08 1.89832e+09 4.46374e+09 5.56461 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1.87073e+09 27.0189 4.3497e+16 7.33825e+17 2.2451e+16 409 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.33825e+17 5.38611 1.24931e+17 2.9202e+20 1.56078e+34
in_reply_to_user_id_str CORR in_reply_to_user_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
lat NUM NaN NaN 643 304 False 1547104 42.3601 0 192735 0 0.996675 0.472784 NaN NaN 33.6589 -6.21167 37.7749 41.824 51.7404 0.48021 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 8.16509 4.98192 10.3333 59.8007 33.9571 -37.7843 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 97.5851 -2.17083 16.3065 21834.4 265.903
lon NUM NaN NaN 643 304 False 1547104 -71.0589 0 192735 0 0.996675 0.472784 NaN NaN -105.087 -122.246 -77.9491 -29.8896 107.615 -1.22892 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 75.1971 0.938622 54.8938 151.736 -57.3147 -123.012 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 274.748 1.40557 70.4355 -36853.4 4961.16
quoted_status_favorite_count NUM NaN NaN 1698 274 False 1547104 0 0 191680 0 0.991219 0.161366 NaN NaN 1 0 7 48 1227.85 8.90776 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 47 798.828 603.404 109888 361.769 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 282 0.00145828 109888 25.0669 3222.55 614284 1.03848e+07
quoted_status_id CONST NaN NaN 0 1 False 1547104 NaN 0 193378 0 1 inf NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
quoted_status_id_str CONST NaN NaN 0 1 False 1547104 NaN 0 193378 0 1 inf NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
quoted_status_retweet_count CORR quoted_status_favorite_count 0.984934 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
quoted_status_user_favourites_count NUM NaN NaN 1698 1007 False 1547104 0 0 191680 0 0.991219 0.593051 NaN NaN 143 2 762.5 3615.25 37633.3 3.38661 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 3472.25 219.231 10849.2 640291 7707.14 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 62 0.000320616 640291 11.2769 26101.1 1.30867e+07 6.81266e+08
quoted_status_user_followers_count NUM NaN NaN 1698 1243 False 1547104 7.03267e+06 0 191680 0 0.991219 0.732038 NaN NaN 654.25 87 3406.5 39719 784927 6.56714 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 39064.8 385.685 566569 5.9921e+07 335108 7 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 5.9921e+07 16.929 2.2007e+06 5.69013e+08 4.84308e+12
quoted_status_user_friends_count NUM NaN NaN 1698 934 False 1547104 1 0 191680 0 0.991219 0.550059 NaN NaN 194 7 482 1482.25 11083.6 3.82085 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1288.25 43.7724 5600.18 181261 3878.3 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 30 0.000155137 181261 6.18651 14818.4 6.58536e+06 2.19586e+08
quoted_status_user_id NUM NaN NaN 1698 1162 False 1547104 3.14878e+09 0 191680 0 0.991219 0.684335 NaN NaN 2.94043e+07 6.2897e+06 2.80974e+08 1.97674e+09 4.40222e+09 5.46786 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1.94733e+09 26.0156 4.48007e+16 7.32241e+17 2.31502e+16 4816 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.32241e+17 5.28935 1.26582e+17 3.9309e+19 1.6023e+34
quoted_status_user_id_str CORR quoted_status_user_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
quoted_status_user_listed_count NUM NaN NaN 1698 639 False 1547104 0 0 191680 0 0.991219 0.376325 NaN NaN 29 2 143.5 815.25 6522.8 4.95122 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 786.25 48.7525 5387.76 173929 3456.34 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 54 0.000279246 173929 6.95377 17113.1 5.86886e+06 2.92858e+08
quoted_status_user_statuses_count NUM NaN NaN 1698 1287 False 1547104 193095 0 191680 0 0.991219 0.757951 NaN NaN 1687.25 136.4 5939 24840.8 140758 1.86495 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 23153.5 10.9608 31411.6 354746 26693.1 4 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 354742 3.10405 49781.3 4.53249e+07 2.47818e+09
quoted_status_user_utc_offset NUM NaN NaN 1406 24 False 1547104 -25200 0 191972 0 0.992729 0.0170697 NaN NaN -25200 -25200 -14400 3600 28800 -1.97935 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 28800 0.146581 14250.3 43200 -8585.21 -39600 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 27 0.000139623 82800 0.940753 16993.1 -1.20708e+07 2.88767e+08
retweet_count NUM NaN NaN 193378 837 False 1547104 0 0 0 0 0 0.00432831 NaN NaN 0 0 0 4 102 16.381 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 4 12541.7 92.4298 166648 53.5678 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 99109 0.512514 166648 92.0421 877.497 10358843 770002
retweeted_status_favorite_count CORR retweet_count 0.91551 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_id NUM NaN NaN 69423 21016 False 1547104 7.33324e+17 0 123955 0 0.640998 0.302724 NaN NaN 7.23781e+17 7.21277e+17 7.26874e+17 7.3002e+17 7.3364e+17 0.0400327 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 6.23958e+15 322.608 6.15579e+15 7.34562e+17 7.24606e+17 1.1406e+09 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.34562e+17 -16.7698 2.90079e+16 5.03043e+22 8.41459e+32
retweeted_status_id_str CORR retweeted_status_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_in_reply_to_status_id CORR retweeted_status_id_str 0.994562 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_in_reply_to_status_id_str CORR retweeted_status_in_reply_to_status_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_in_reply_to_user_id NUM NaN NaN 2802 1271 False 1547104 3.04172e+07 0 190576 0 0.98551 0.453605 NaN NaN 3.04172e+07 1.29689e+07 2.17203e+08 9.00048e+08 3.28169e+09 8.01208 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 8.69631e+08 60.319 2.16889e+16 7.27333e+17 1.10135e+16 5339 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.27333e+17 7.89102 8.8241e+16 3.08598e+19 7.78647e+33
retweeted_status_in_reply_to_user_id_str CORR retweeted_status_in_reply_to_user_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_quoted_status_favorite_count NUM NaN NaN 2162 132 False 1547104 114 0 191216 0 0.98882 0.0610546 NaN NaN 6 0 114 114 220 23.2108 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 108 1989.7 434.843 311618 294.716 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 148 0.00076534 311618 43.9257 6840.59 637176 4.67937e+07
retweeted_status_quoted_status_id CONST NaN NaN 0 1 False 1547104 NaN 0 193378 0 1 inf NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_quoted_status_id_str CONST NaN NaN 0 1 False 1547104 NaN 0 193378 0 1 inf NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_quoted_status_retweet_count CORR retweeted_status_quoted_status_favorite_count 0.998323 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_quoted_status_user_favourites_count NUM NaN NaN 1071 347 False 1547104 0 0 192307 0 0.994462 0.323996 NaN NaN 71 0 388 3088.5 17000 3.07893 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 3017.5 168.397 5802.46 269482 4255.13 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 117 0.000605033 269482 10.2363 13101.3 4.55724e+06 1.71643e+08
retweeted_status_quoted_status_user_followers_count CORR retweeted_status_favorite_count 0.994986 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_quoted_status_user_friends_count NUM NaN NaN 1071 335 False 1547104 23 0 192307 0 0.994462 0.312792 NaN NaN 110 23 638 1392 15806 3.74559 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1282 36.6869 5096.08 96189 3565.98 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 18 9.30819e-05 96189 5.9608 13356.7 3.81917e+06 1.78402e+08
retweeted_status_quoted_status_user_id NUM NaN NaN 1071 341 False 1547104 3.70345e+07 0 192307 0 0.994462 0.318394 NaN NaN 1.84834e+07 2.31156e+06 8.79152e+07 8.30324e+08 7.08559e+17 3.9042 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 8.11841e+08 11.3568 8.22746e+16 7.28733e+17 4.38388e+16 5339 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.28733e+17 3.65159 1.71156e+17 4.69514e+19 2.92942e+34
retweeted_status_quoted_status_user_id_str CORR retweeted_status_quoted_status_user_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_quoted_status_user_listed_count NUM NaN NaN 1071 265 False 1547104 10429 0 192307 0 0.994462 0.247432 NaN NaN 45 3.5 201 849 10429 1.88835 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 804 3.36035 2384.45 21608 1733.96 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 12 6.20546e-05 21608 2.0779 3274.31 1.85707e+06 1.07211e+07
retweeted_status_quoted_status_user_statuses_count NUM NaN NaN 1071 394 False 1547104 259036 0 192307 0 0.994462 0.36788 NaN NaN 1324 137 9655 30485 259036 1.8521 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 29161 3.10496 55303.7 354746 43782.9 4 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 354742 2.15197 81090.3 4.68914e+07 6.57563e+09
retweeted_status_quoted_status_user_utc_offset NUM NaN NaN 969 19 False 1547104 -25200 0 192409 0 0.994989 0.0196078 NaN NaN -21600 -25200 -14400 7200 19800 -3.07134 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 28800 -0.867897 15921.1 43200 -5814.24 -36000 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 8 4.13698e-05 79200 0.441733 17857.5 -5.634e+06 3.18891e+08
retweeted_status_retweet_count CORR retweeted_status_quoted_status_user_followers_... 0.996464 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_user_favourites_count NUM NaN NaN 69423 4678 False 1547104 0 0 123955 0 0.640998 0.067384 NaN NaN 60 0 532 3432 34897 3.01722 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 3372 71.0348 10788.7 424498 7552.07 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 6745 0.0348799 424498 6.72324 22786.3 5.24288e+08 5.19215e+08
retweeted_status_user_followers_count NUM NaN NaN 69423 6635 False 1547104 699897 0 123955 0 0.640998 0.0955735 NaN NaN 807.5 72 4029 34659 699884 6.12235 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 33851.5 457.858 223830 3.85075e+07 142060 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 18 9.30819e-05 3.85075e+07 17.2514 869741 9.86224e+09 7.56449e+11
retweeted_status_user_friends_count NUM NaN NaN 69423 3698 False 1547104 0 0 123955 0 0.640998 0.0532676 NaN NaN 188 5 608 2749 69682 4.91855 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 2561 10652.6 10431.9 4.71732e+06 7067.45 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1890 0.0097736 4.71732e+06 83.1461 34761.6 4.90643e+08 1.20837e+09
retweeted_status_user_id NUM NaN NaN 69423 10376 False 1547104 3.04175e+07 0 123955 0 0.640998 0.149461 NaN NaN 3.70534e+07 1.41749e+07 3.59035e+08 2.29318e+09 4.8208e+09 4.71808 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 2.25613e+09 18.3147 5.8856e+16 7.34092e+17 3.07502e+16 22 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 7.34092e+17 4.50679 1.45082e+17 2.13477e+21 2.10488e+34
retweeted_status_user_id_str CORR retweeted_status_user_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
retweeted_status_user_listed_count NUM NaN NaN 69423 1835 False 1547104 797 0 123955 0 0.640998 0.0264322 NaN NaN 61 4 241 845 4420 4.09158 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 784 288.735 1654.3 173930 1311.71 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1148 0.00593656 173930 14.8771 5366.95 9.10627e+07 2.88042e+07
retweeted_status_user_statuses_count NUM NaN NaN 69423 9782 False 1547104 4749 0 123955 0 0.640998 0.140904 NaN NaN 1912 130 6757 25271 97449.1 2.21142 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 23359 80.9361 27625 1.27755e+06 24416.8 1 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 1.27755e+06 6.97158 53995.7 1.69508e+09 2.91553e+09
retweeted_status_user_utc_offset NUM NaN NaN 53755 31 False 1547104 -14400 0 139623 0 0.722021 0.000576691 NaN NaN -25200 -25200 -14400 3600 19800 -1.74343 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 28800 -0.0880758 13368.7 46800 -8987.9 -39600 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 165 0.000853251 86400 0.825304 15669.8 -4.83144e+08 2.45543e+08
text CAT NaN NaN 193378 130674 False 1547104 RT @daniel_bilar: Fake participation in conf c... 0 0 0 0 0.675744 RT @daniel_bilar: Fake participation in conf c... 1049 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
user_favourites_count NUM NaN NaN 193378 14432 False 1547104 0 0 0 0 0 0.074631 NaN NaN 1 0 73 1020.75 13144 4.2731 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1019.75 249.705 4522.48 673894 2981.35 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 44441 0.229814 673894 11.6228 12739.6 576528032 1.62298e+08
user_followers_count NUM NaN NaN 193378 10069 False 1547104 249 0 0 0 0 0.052069 NaN NaN 152 25 448 1141 8835 17.2584 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 989 14135.4 5023.65 10383944 3443.86 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 162 0.000837737 10383944 108.909 59435.5 665966105 3.53257e+09
user_friends_count NUM NaN NaN 193378 6384 False 1547104 0 0 0 0 0 0.0330131 NaN NaN 83 0 343 1090 3945 4.09485 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 1007 572.551 1725.22 382464 1428.3 -5 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 10204 0.0527671 382469 17.8992 5848.68 276202101 3.4207e+07
user_id NUM NaN NaN 193378 66076 False 1547104 4638112776 0 0 0 0 0.341693 NaN NaN 1.98668e+08 1.59798e+07 1.49294e+09 3.33342e+09 7.1329e+17 3.39262 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 3.13475e+09 7.60179 1.05341e+17 734220480727683079 5.7247e+16 22 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 734220480727683057 3.09837 1.94217e+17 2265287164665850537 3.77204e+34
user_id_str CORR user_id 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
user_listed_count NUM NaN NaN 193378 2703 False 1547104 0 0 0 0 0 0.0139778 NaN NaN 16 1 78 248 1304.15 3.18321 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 232 1348.17 436.975 129229 353.926 0 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 6974 0.0360641 129229 19.3917 1126.62 68441453 1.26927e+06
user_statuses_count NUM NaN NaN 193378 37874 False 1547104 11545 0 0 0 0 0.195855 NaN NaN 1887 141 11136 57997 280665 2.25411 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 56110 26.9205 76620.4 2537204 61341.8 1 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 0 0 2537203 4.49368 138271 11862150564 1.91189e+10
user_utc_offset NUM NaN NaN 119043 32 False 1547104 -25200 0 74335 0 0.384403 0.00026881 NaN NaN -25200 -25200 -14400 7200 28800 -2.82697 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 32400 -0.726381 15957.5 46800 -6377.12 -39600 data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA... 686 0.00354746 86400 0.586578 18028 -7.59152e+08 3.25007e+08

In [8]:
desc['table']


Out[8]:
{'CAT': 2,
 'CONST': 4,
 'CORR': 17,
 'DATE': 0,
 'NUM': 38,
 'REJECTED': 21,
 'UNIQUE': 0,
 'memsize': '90.0 MiB',
 'n': 193378,
 'n_duplicates': 447,
 'nvar': 61,
 'recordsize': '488.0 B',
 'total_missing': 0.49723492373469169}

In [9]:
# desc.keys()
html = pandas_profiling.to_html(df.head(), desc).encode('utf8')
with open('report.html', 'w') as fout:
    fout.write(html)
display(HTML(html))
# report = pandas_profiling.ProfileReport(df)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-e3c1a3e88899> in <module>()
      2 html = pandas_profiling.to_html(df.head(), desc).encode('utf8')
      3 with open('report.html', 'w') as fout:
----> 4     fout.write(html)
      5 display(HTML(html))
      6 # report = pandas_profiling.ProfileReport(df)

TypeError: write() argument must be str, not bytes

In [ ]: