In [2]:
# import sys; sys.path.append('.')
from setup import *
%matplotlib inline
In [3]:
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
In [4]:
print('Loading previously "cleaned" tweets (could take a minute or so)...')
df = pd.read_csv(os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'), index_col='id', compression='gzip',
quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC, low_memory=False)
print('Loaded {} tweets.'.format(len(df)))
Loading previously "cleaned" tweets (could take a minute or so)...
Loaded 193378 tweets.
In [5]:
print('df.describe() stats:')
short_desc = df.describe()
for col, stats in short_desc.T.iterrows():
print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
print(dict(zip(list(stats.index.values[[0,1,2,3,7]].T), list(stats.values[[0,1,2,3,7]].T))))
df.describe() stats:
/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
favorite_count (int64)
{'max': 1165.0, 'min': 0.0, 'mean': 0.62967866044741383, 'std': 6.2513185226973791, 'count': 193378.0}
id_str (int64)
{'max': 7.345638826146775e+17, 'min': 7.2143063523191194e+17, 'mean': 7.2748881589661248e+17, 'std': 3778481285444856.5, 'count': 193378.0}
in_reply_to_status_id (float64)
{'max': 7.345608471215145e+17, 'min': 23191365028.0, 'mean': 7.270313189507031e+17, 'std': 14858768435192046.0, 'count': 11165.0}
in_reply_to_status_id_str (float64)
{'max': 7.345608471215145e+17, 'min': 23191365028.0, 'mean': 7.270313189507031e+17, 'std': 14858768435192046.0, 'count': 11165.0}
in_reply_to_user_id (float64)
{'max': 7.3382480402583552e+17, 'min': 409.0, 'mean': 22451025124249248.0, 'std': 1.249313082801933e+17, 'count': 13007.0}
in_reply_to_user_id_str (float64)
{'max': 7.3382480402583552e+17, 'min': 409.0, 'mean': 22451025124249248.0, 'std': 1.249313082801933e+17, 'count': 13007.0}
lat (float64)
{'max': 59.800736800000003, 'min': -37.78433338, 'mean': 33.957110546951782, 'std': 16.306535104738987, 'count': 643.0}
lon (float64)
{'max': 151.73564382000001, 'min': -123.01192349, 'mean': -57.314729004883347, 'std': 70.435494395994823, 'count': 643.0}
quoted_status_favorite_count (float64)
{'max': 109888.0, 'min': 0.0, 'mean': 361.76914016489991, 'std': 3222.5511177090816, 'count': 1698.0}
quoted_status_retweet_count (float64)
{'max': 84527.0, 'min': 0.0, 'mean': 298.00647820965844, 'std': 2681.6341400580304, 'count': 1698.0}
quoted_status_user_favourites_count (float64)
{'max': 640291.0, 'min': 0.0, 'mean': 7707.1395759717316, 'std': 26101.078683741667, 'count': 1698.0}
quoted_status_user_followers_count (float64)
{'max': 59921020.0, 'min': 7.0, 'mean': 335107.81978798588, 'std': 2200700.2792996545, 'count': 1698.0}
quoted_status_user_friends_count (float64)
{'max': 181261.0, 'min': 0.0, 'mean': 3878.3027090694936, 'std': 14818.43093341756, 'count': 1698.0}
quoted_status_user_id (float64)
{'max': 7.3224109876099994e+17, 'min': 4816.0, 'mean': 23150203266414092.0, 'std': 1.2658212835104374e+17, 'count': 1698.0}
quoted_status_user_id_str (float64)
{'max': 7.322410987608023e+17, 'min': 4816.0, 'mean': 23150203266414028.0, 'std': 1.2658212835104352e+17, 'count': 1698.0}
quoted_status_user_listed_count (float64)
{'max': 173929.0, 'min': 0.0, 'mean': 3456.3374558303885, 'std': 17113.085283341832, 'count': 1698.0}
quoted_status_user_statuses_count (float64)
{'max': 354746.0, 'min': 4.0, 'mean': 26693.128386336866, 'std': 49781.314248201052, 'count': 1698.0}
quoted_status_user_utc_offset (float64)
{'max': 43200.0, 'min': -39600.0, 'mean': -8585.2062588904701, 'std': 16993.135149213045, 'count': 1406.0}
retweet_count (int64)
{'max': 166648.0, 'min': 0.0, 'mean': 53.567846394108948, 'std': 877.49740412536721, 'count': 193378.0}
retweeted_status_favorite_count (float64)
{'max': 215360.0, 'min': 0.0, 'mean': 156.26144073289831, 'std': 1597.5166348846326, 'count': 69423.0}
retweeted_status_id (float64)
{'max': 7.3456245703e+17, 'min': 1140597519.0, 'mean': 7.2460575642406733e+17, 'std': 29007908869735376.0, 'count': 69423.0}
retweeted_status_id_str (float64)
{'max': 7.3456245702961152e+17, 'min': 1140597519.0, 'mean': 7.2460575642405632e+17, 'std': 29007908869735456.0, 'count': 69423.0}
retweeted_status_in_reply_to_status_id (float64)
{'max': 7.3452723504286106e+17, 'min': 3613581976.0, 'mean': 7.2422442939467597e+17, 'std': 29854853455039460.0, 'count': 2295.0}
retweeted_status_in_reply_to_status_id_str (float64)
{'max': 7.3452723504286106e+17, 'min': 3613581976.0, 'mean': 7.2422442939467597e+17, 'std': 29854853455039460.0, 'count': 2295.0}
retweeted_status_in_reply_to_user_id (float64)
{'max': 7.2733294780473754e+17, 'min': 5339.0, 'mean': 11013487585371126.0, 'std': 88240954573208480.0, 'count': 2802.0}
retweeted_status_in_reply_to_user_id_str (float64)
{'max': 7.2733294780473754e+17, 'min': 5339.0, 'mean': 11013487585371126.0, 'std': 88240954573208480.0, 'count': 2802.0}
retweeted_status_quoted_status_favorite_count (float64)
{'max': 311618.0, 'min': 0.0, 'mean': 294.7160037002775, 'std': 6840.5942490111884, 'count': 2162.0}
retweeted_status_quoted_status_retweet_count (float64)
{'max': 406556.0, 'min': 0.0, 'mean': 319.83857539315449, 'std': 8842.6321156302329, 'count': 2162.0}
retweeted_status_quoted_status_user_favourites_count (float64)
{'max': 269482.0, 'min': 0.0, 'mean': 4255.1307189542486, 'std': 13101.264209088928, 'count': 1071.0}
retweeted_status_quoted_status_user_followers_count (float64)
{'max': 6762776.0, 'min': 7.0, 'mean': 674082.1176470588, 'std': 1972163.1979931768, 'count': 1071.0}
retweeted_status_quoted_status_user_friends_count (float64)
{'max': 96189.0, 'min': 0.0, 'mean': 3565.9813258636786, 'std': 13356.713457320231, 'count': 1071.0}
retweeted_status_quoted_status_user_id (float64)
{'max': 7.28732752418e+17, 'min': 5339.0, 'mean': 43838841893600248.0, 'std': 1.7115560262246653e+17, 'count': 1071.0}
retweeted_status_quoted_status_user_id_str (float64)
{'max': 7.2873275241836134e+17, 'min': 5339.0, 'mean': 43838841893618480.0, 'std': 1.711556026225375e+17, 'count': 1071.0}
retweeted_status_quoted_status_user_listed_count (float64)
{'max': 21608.0, 'min': 0.0, 'mean': 1733.9551820728291, 'std': 3274.3065422420227, 'count': 1071.0}
retweeted_status_quoted_status_user_statuses_count (float64)
{'max': 354746.0, 'min': 4.0, 'mean': 43782.852474323059, 'std': 81090.258294518164, 'count': 1071.0}
retweeted_status_quoted_status_user_utc_offset (float64)
{'max': 43200.0, 'min': -36000.0, 'mean': -5814.241486068111, 'std': 17857.526235499183, 'count': 969.0}
retweeted_status_retweet_count (float64)
{'max': 166648.0, 'min': 1.0, 'mean': 148.14081788456275, 'std': 1459.7345420253816, 'count': 69423.0}
retweeted_status_user_favourites_count (float64)
{'max': 424498.0, 'min': 0.0, 'mean': 7552.0748743211907, 'std': 22786.294974404056, 'count': 69423.0}
retweeted_status_user_followers_count (float64)
{'max': 38507544.0, 'min': 0.0, 'mean': 142060.08318568775, 'std': 869740.8912563012, 'count': 69423.0}
retweeted_status_user_friends_count (float64)
{'max': 4717316.0, 'min': 0.0, 'mean': 7067.4456016017748, 'std': 34761.613571936403, 'count': 69423.0}
retweeted_status_user_id (float64)
{'max': 7.34092058672e+17, 'min': 22.0, 'mean': 30750178123179024.0, 'std': 1.4508188361866784e+17, 'count': 69423.0}
retweeted_status_user_id_str (float64)
{'max': 7.3409205867213619e+17, 'min': 22.0, 'mean': 30750178123180928.0, 'std': 1.4508188361867664e+17, 'count': 69423.0}
retweeted_status_user_listed_count (float64)
{'max': 173930.0, 'min': 0.0, 'mean': 1311.7073160191867, 'std': 5366.9508868242192, 'count': 69423.0}
retweeted_status_user_statuses_count (float64)
{'max': 1277548.0, 'min': 1.0, 'mean': 24416.755412471372, 'std': 53995.675407694944, 'count': 69423.0}
retweeted_status_user_utc_offset (float64)
{'max': 46800.0, 'min': -39600.0, 'mean': -8987.895079527485, 'std': 15669.797441958659, 'count': 53755.0}
user_favourites_count (int64)
{'max': 673894.0, 'min': 0.0, 'mean': 2981.3527495371759, 'std': 12739.631357957256, 'count': 193378.0}
user_followers_count (int64)
{'max': 10383944.0, 'min': 0.0, 'mean': 3443.8566176090353, 'std': 59435.460778443674, 'count': 193378.0}
user_friends_count (int64)
{'max': 382464.0, 'min': -5.0, 'mean': 1428.3015699821076, 'std': 5848.6786386995682, 'count': 193378.0}
user_id (int64)
{'max': 7.3422048072768307e+17, 'min': 22.0, 'mean': 57247007060732848.0, 'std': 1.9421740629431414e+17, 'count': 193378.0}
user_id_str (int64)
{'max': 7.3422048072768307e+17, 'min': 22.0, 'mean': 57247007060732848.0, 'std': 1.9421740629431414e+17, 'count': 193378.0}
user_listed_count (int64)
{'max': 129229.0, 'min': 0.0, 'mean': 353.92574646547178, 'std': 1126.6207791892587, 'count': 193378.0}
user_statuses_count (int64)
{'max': 2537204.0, 'min': 1.0, 'mean': 61341.779126891372, 'std': 138271.15238577841, 'count': 193378.0}
user_utc_offset (float64)
{'max': 46800.0, 'min': -39600.0, 'mean': -6377.1225523550311, 'std': 18027.9532895392, 'count': 119043.0}
quoted_status_id (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
quoted_status_id_str (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
retweeted_status_quoted_status_id (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
retweeted_status_quoted_status_id_str (float64)
{'max': nan, 'min': nan, 'mean': nan, 'std': nan, 'count': 0.0}
favorite_count.1 (int64)
{'max': 1165.0, 'min': 0.0, 'mean': 0.62967866044741383, 'std': 6.2513185226973791, 'count': 193378.0}
In [6]:
# this takes a few minutes
print('Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
# at least describe produces a dataframe of stats
desc = pandas_profiling.describe(df)
desc['table']
# for col, stats in desc['variables'].iterrows():
# print('')
# print(col)
# print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
# print(stats)
# and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms
Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc
Out[6]:
{'CAT': 2,
'CONST': 4,
'CORR': 17,
'DATE': 0,
'NUM': 38,
'REJECTED': 21,
'UNIQUE': 0,
'memsize': '90.0 MiB',
'n': 193378,
'n_duplicates': 447,
'nvar': 61,
'recordsize': '488.0 B',
'total_missing': 0.49723492373469169}
In [7]:
desc['variables']
Out[7]:
type
correlation_var
correlation
count
distinct_count
is_unique
memorysize
mode
n_infinite
n_missing
p_infinite
p_missing
p_unique
top
freq
25%
5%
50%
75%
95%
cv
histogram
iqr
kurtosis
mad
max
mean
min
mini_histogram
n_zeros
p_zeros
range
skewness
std
sum
variance
favorite_count
NUM
NaN
NaN
193378
148
False
1547104
0
0
0
0
0
0.00076534
NaN
NaN
0
0
0
0
2
9.92779
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
12880.5
1.02987
1165
0.629679
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
158139
0.817771
1165
89.7989
6.25132
121766
39.079
favorite_count.1
CORR
favorite_count
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
geo_coordinates
CAT
NaN
NaN
643
304
False
1547104
[42.3600825, -71.0588801]
0
192735
0
0.996675
0.472784
[42.3600825, -71.0588801]
26
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
id
NUM
NaN
NaN
193378
183070
False
1547104
724249718457548801
0
0
0
0
0.946695
NaN
NaN
7.24242e+17
7.22075e+17
7.27197e+17
7.30132e+17
7.33768e+17
0.00519387
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
5.88999e+15
-1.10308
3.18786e+15
734563882614677504
7.27489e+17
721430635231911936
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
13133247382765568
0.253459
3.77848e+15
5461934346094412227
1.42769e+31
id_str
CORR
id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
in_reply_to_status_id
NUM
NaN
NaN
11165
10062
False
1547104
7.31411e+17
0
182213
0
0.942263
0.901209
NaN
NaN
7.24486e+17
7.22009e+17
7.27255e+17
7.31397e+17
7.33801e+17
0.0204376
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
6.911e+15
1568.15
3.86773e+15
7.34561e+17
7.27031e+17
2.31914e+10
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.34561e+17
-35.2158
1.48588e+16
8.1173e+21
2.20783e+32
in_reply_to_status_id_str
CORR
in_reply_to_status_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
in_reply_to_user_id
NUM
NaN
NaN
13007
8562
False
1547104
3.0085e+09
0
180371
0
0.932738
0.658261
NaN
NaN
2.75862e+07
9.29635e+06
2.64281e+08
1.89832e+09
4.46374e+09
5.56461
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1.87073e+09
27.0189
4.3497e+16
7.33825e+17
2.2451e+16
409
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.33825e+17
5.38611
1.24931e+17
2.9202e+20
1.56078e+34
in_reply_to_user_id_str
CORR
in_reply_to_user_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
lat
NUM
NaN
NaN
643
304
False
1547104
42.3601
0
192735
0
0.996675
0.472784
NaN
NaN
33.6589
-6.21167
37.7749
41.824
51.7404
0.48021
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
8.16509
4.98192
10.3333
59.8007
33.9571
-37.7843
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
97.5851
-2.17083
16.3065
21834.4
265.903
lon
NUM
NaN
NaN
643
304
False
1547104
-71.0589
0
192735
0
0.996675
0.472784
NaN
NaN
-105.087
-122.246
-77.9491
-29.8896
107.615
-1.22892
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
75.1971
0.938622
54.8938
151.736
-57.3147
-123.012
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
274.748
1.40557
70.4355
-36853.4
4961.16
quoted_status_favorite_count
NUM
NaN
NaN
1698
274
False
1547104
0
0
191680
0
0.991219
0.161366
NaN
NaN
1
0
7
48
1227.85
8.90776
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
47
798.828
603.404
109888
361.769
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
282
0.00145828
109888
25.0669
3222.55
614284
1.03848e+07
quoted_status_id
CONST
NaN
NaN
0
1
False
1547104
NaN
0
193378
0
1
inf
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
quoted_status_id_str
CONST
NaN
NaN
0
1
False
1547104
NaN
0
193378
0
1
inf
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
quoted_status_retweet_count
CORR
quoted_status_favorite_count
0.984934
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
quoted_status_user_favourites_count
NUM
NaN
NaN
1698
1007
False
1547104
0
0
191680
0
0.991219
0.593051
NaN
NaN
143
2
762.5
3615.25
37633.3
3.38661
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
3472.25
219.231
10849.2
640291
7707.14
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
62
0.000320616
640291
11.2769
26101.1
1.30867e+07
6.81266e+08
quoted_status_user_followers_count
NUM
NaN
NaN
1698
1243
False
1547104
7.03267e+06
0
191680
0
0.991219
0.732038
NaN
NaN
654.25
87
3406.5
39719
784927
6.56714
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
39064.8
385.685
566569
5.9921e+07
335108
7
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
5.9921e+07
16.929
2.2007e+06
5.69013e+08
4.84308e+12
quoted_status_user_friends_count
NUM
NaN
NaN
1698
934
False
1547104
1
0
191680
0
0.991219
0.550059
NaN
NaN
194
7
482
1482.25
11083.6
3.82085
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1288.25
43.7724
5600.18
181261
3878.3
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
30
0.000155137
181261
6.18651
14818.4
6.58536e+06
2.19586e+08
quoted_status_user_id
NUM
NaN
NaN
1698
1162
False
1547104
3.14878e+09
0
191680
0
0.991219
0.684335
NaN
NaN
2.94043e+07
6.2897e+06
2.80974e+08
1.97674e+09
4.40222e+09
5.46786
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1.94733e+09
26.0156
4.48007e+16
7.32241e+17
2.31502e+16
4816
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.32241e+17
5.28935
1.26582e+17
3.9309e+19
1.6023e+34
quoted_status_user_id_str
CORR
quoted_status_user_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
quoted_status_user_listed_count
NUM
NaN
NaN
1698
639
False
1547104
0
0
191680
0
0.991219
0.376325
NaN
NaN
29
2
143.5
815.25
6522.8
4.95122
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
786.25
48.7525
5387.76
173929
3456.34
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
54
0.000279246
173929
6.95377
17113.1
5.86886e+06
2.92858e+08
quoted_status_user_statuses_count
NUM
NaN
NaN
1698
1287
False
1547104
193095
0
191680
0
0.991219
0.757951
NaN
NaN
1687.25
136.4
5939
24840.8
140758
1.86495
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
23153.5
10.9608
31411.6
354746
26693.1
4
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
354742
3.10405
49781.3
4.53249e+07
2.47818e+09
quoted_status_user_utc_offset
NUM
NaN
NaN
1406
24
False
1547104
-25200
0
191972
0
0.992729
0.0170697
NaN
NaN
-25200
-25200
-14400
3600
28800
-1.97935
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
28800
0.146581
14250.3
43200
-8585.21
-39600
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
27
0.000139623
82800
0.940753
16993.1
-1.20708e+07
2.88767e+08
retweet_count
NUM
NaN
NaN
193378
837
False
1547104
0
0
0
0
0
0.00432831
NaN
NaN
0
0
0
4
102
16.381
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
4
12541.7
92.4298
166648
53.5678
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
99109
0.512514
166648
92.0421
877.497
10358843
770002
retweeted_status_favorite_count
CORR
retweet_count
0.91551
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_id
NUM
NaN
NaN
69423
21016
False
1547104
7.33324e+17
0
123955
0
0.640998
0.302724
NaN
NaN
7.23781e+17
7.21277e+17
7.26874e+17
7.3002e+17
7.3364e+17
0.0400327
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
6.23958e+15
322.608
6.15579e+15
7.34562e+17
7.24606e+17
1.1406e+09
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.34562e+17
-16.7698
2.90079e+16
5.03043e+22
8.41459e+32
retweeted_status_id_str
CORR
retweeted_status_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_in_reply_to_status_id
CORR
retweeted_status_id_str
0.994562
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_in_reply_to_status_id_str
CORR
retweeted_status_in_reply_to_status_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_in_reply_to_user_id
NUM
NaN
NaN
2802
1271
False
1547104
3.04172e+07
0
190576
0
0.98551
0.453605
NaN
NaN
3.04172e+07
1.29689e+07
2.17203e+08
9.00048e+08
3.28169e+09
8.01208
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
8.69631e+08
60.319
2.16889e+16
7.27333e+17
1.10135e+16
5339
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.27333e+17
7.89102
8.8241e+16
3.08598e+19
7.78647e+33
retweeted_status_in_reply_to_user_id_str
CORR
retweeted_status_in_reply_to_user_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_quoted_status_favorite_count
NUM
NaN
NaN
2162
132
False
1547104
114
0
191216
0
0.98882
0.0610546
NaN
NaN
6
0
114
114
220
23.2108
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
108
1989.7
434.843
311618
294.716
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
148
0.00076534
311618
43.9257
6840.59
637176
4.67937e+07
retweeted_status_quoted_status_id
CONST
NaN
NaN
0
1
False
1547104
NaN
0
193378
0
1
inf
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_quoted_status_id_str
CONST
NaN
NaN
0
1
False
1547104
NaN
0
193378
0
1
inf
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_quoted_status_retweet_count
CORR
retweeted_status_quoted_status_favorite_count
0.998323
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_quoted_status_user_favourites_count
NUM
NaN
NaN
1071
347
False
1547104
0
0
192307
0
0.994462
0.323996
NaN
NaN
71
0
388
3088.5
17000
3.07893
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
3017.5
168.397
5802.46
269482
4255.13
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
117
0.000605033
269482
10.2363
13101.3
4.55724e+06
1.71643e+08
retweeted_status_quoted_status_user_followers_count
CORR
retweeted_status_favorite_count
0.994986
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_quoted_status_user_friends_count
NUM
NaN
NaN
1071
335
False
1547104
23
0
192307
0
0.994462
0.312792
NaN
NaN
110
23
638
1392
15806
3.74559
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1282
36.6869
5096.08
96189
3565.98
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
18
9.30819e-05
96189
5.9608
13356.7
3.81917e+06
1.78402e+08
retweeted_status_quoted_status_user_id
NUM
NaN
NaN
1071
341
False
1547104
3.70345e+07
0
192307
0
0.994462
0.318394
NaN
NaN
1.84834e+07
2.31156e+06
8.79152e+07
8.30324e+08
7.08559e+17
3.9042
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
8.11841e+08
11.3568
8.22746e+16
7.28733e+17
4.38388e+16
5339
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.28733e+17
3.65159
1.71156e+17
4.69514e+19
2.92942e+34
retweeted_status_quoted_status_user_id_str
CORR
retweeted_status_quoted_status_user_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_quoted_status_user_listed_count
NUM
NaN
NaN
1071
265
False
1547104
10429
0
192307
0
0.994462
0.247432
NaN
NaN
45
3.5
201
849
10429
1.88835
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
804
3.36035
2384.45
21608
1733.96
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
12
6.20546e-05
21608
2.0779
3274.31
1.85707e+06
1.07211e+07
retweeted_status_quoted_status_user_statuses_count
NUM
NaN
NaN
1071
394
False
1547104
259036
0
192307
0
0.994462
0.36788
NaN
NaN
1324
137
9655
30485
259036
1.8521
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
29161
3.10496
55303.7
354746
43782.9
4
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
354742
2.15197
81090.3
4.68914e+07
6.57563e+09
retweeted_status_quoted_status_user_utc_offset
NUM
NaN
NaN
969
19
False
1547104
-25200
0
192409
0
0.994989
0.0196078
NaN
NaN
-21600
-25200
-14400
7200
19800
-3.07134
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
28800
-0.867897
15921.1
43200
-5814.24
-36000
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
8
4.13698e-05
79200
0.441733
17857.5
-5.634e+06
3.18891e+08
retweeted_status_retweet_count
CORR
retweeted_status_quoted_status_user_followers_...
0.996464
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_user_favourites_count
NUM
NaN
NaN
69423
4678
False
1547104
0
0
123955
0
0.640998
0.067384
NaN
NaN
60
0
532
3432
34897
3.01722
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
3372
71.0348
10788.7
424498
7552.07
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
6745
0.0348799
424498
6.72324
22786.3
5.24288e+08
5.19215e+08
retweeted_status_user_followers_count
NUM
NaN
NaN
69423
6635
False
1547104
699897
0
123955
0
0.640998
0.0955735
NaN
NaN
807.5
72
4029
34659
699884
6.12235
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
33851.5
457.858
223830
3.85075e+07
142060
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
18
9.30819e-05
3.85075e+07
17.2514
869741
9.86224e+09
7.56449e+11
retweeted_status_user_friends_count
NUM
NaN
NaN
69423
3698
False
1547104
0
0
123955
0
0.640998
0.0532676
NaN
NaN
188
5
608
2749
69682
4.91855
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
2561
10652.6
10431.9
4.71732e+06
7067.45
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1890
0.0097736
4.71732e+06
83.1461
34761.6
4.90643e+08
1.20837e+09
retweeted_status_user_id
NUM
NaN
NaN
69423
10376
False
1547104
3.04175e+07
0
123955
0
0.640998
0.149461
NaN
NaN
3.70534e+07
1.41749e+07
3.59035e+08
2.29318e+09
4.8208e+09
4.71808
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
2.25613e+09
18.3147
5.8856e+16
7.34092e+17
3.07502e+16
22
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
7.34092e+17
4.50679
1.45082e+17
2.13477e+21
2.10488e+34
retweeted_status_user_id_str
CORR
retweeted_status_user_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
retweeted_status_user_listed_count
NUM
NaN
NaN
69423
1835
False
1547104
797
0
123955
0
0.640998
0.0264322
NaN
NaN
61
4
241
845
4420
4.09158
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
784
288.735
1654.3
173930
1311.71
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1148
0.00593656
173930
14.8771
5366.95
9.10627e+07
2.88042e+07
retweeted_status_user_statuses_count
NUM
NaN
NaN
69423
9782
False
1547104
4749
0
123955
0
0.640998
0.140904
NaN
NaN
1912
130
6757
25271
97449.1
2.21142
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
23359
80.9361
27625
1.27755e+06
24416.8
1
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
1.27755e+06
6.97158
53995.7
1.69508e+09
2.91553e+09
retweeted_status_user_utc_offset
NUM
NaN
NaN
53755
31
False
1547104
-14400
0
139623
0
0.722021
0.000576691
NaN
NaN
-25200
-25200
-14400
3600
19800
-1.74343
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
28800
-0.0880758
13368.7
46800
-8987.9
-39600
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
165
0.000853251
86400
0.825304
15669.8
-4.83144e+08
2.45543e+08
text
CAT
NaN
NaN
193378
130674
False
1547104
RT @daniel_bilar: Fake participation in conf c...
0
0
0
0
0.675744
RT @daniel_bilar: Fake participation in conf c...
1049
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
user_favourites_count
NUM
NaN
NaN
193378
14432
False
1547104
0
0
0
0
0
0.074631
NaN
NaN
1
0
73
1020.75
13144
4.2731
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1019.75
249.705
4522.48
673894
2981.35
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
44441
0.229814
673894
11.6228
12739.6
576528032
1.62298e+08
user_followers_count
NUM
NaN
NaN
193378
10069
False
1547104
249
0
0
0
0
0.052069
NaN
NaN
152
25
448
1141
8835
17.2584
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
989
14135.4
5023.65
10383944
3443.86
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
162
0.000837737
10383944
108.909
59435.5
665966105
3.53257e+09
user_friends_count
NUM
NaN
NaN
193378
6384
False
1547104
0
0
0
0
0
0.0330131
NaN
NaN
83
0
343
1090
3945
4.09485
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
1007
572.551
1725.22
382464
1428.3
-5
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
10204
0.0527671
382469
17.8992
5848.68
276202101
3.4207e+07
user_id
NUM
NaN
NaN
193378
66076
False
1547104
4638112776
0
0
0
0
0.341693
NaN
NaN
1.98668e+08
1.59798e+07
1.49294e+09
3.33342e+09
7.1329e+17
3.39262
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
3.13475e+09
7.60179
1.05341e+17
734220480727683079
5.7247e+16
22
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
734220480727683057
3.09837
1.94217e+17
2265287164665850537
3.77204e+34
user_id_str
CORR
user_id
1
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
user_listed_count
NUM
NaN
NaN
193378
2703
False
1547104
0
0
0
0
0
0.0139778
NaN
NaN
16
1
78
248
1304.15
3.18321
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
232
1348.17
436.975
129229
353.926
0
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
6974
0.0360641
129229
19.3917
1126.62
68441453
1.26927e+06
user_statuses_count
NUM
NaN
NaN
193378
37874
False
1547104
11545
0
0
0
0
0.195855
NaN
NaN
1887
141
11136
57997
280665
2.25411
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
56110
26.9205
76620.4
2537204
61341.8
1
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
0
0
2537203
4.49368
138271
11862150564
1.91189e+10
user_utc_offset
NUM
NaN
NaN
119043
32
False
1547104
-25200
0
74335
0
0.384403
0.00026881
NaN
NaN
-25200
-25200
-14400
7200
28800
-2.82697
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
32400
-0.726381
15957.5
46800
-6377.12
-39600
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...
686
0.00354746
86400
0.586578
18028
-7.59152e+08
3.25007e+08
In [8]:
desc['table']
Out[8]:
{'CAT': 2,
'CONST': 4,
'CORR': 17,
'DATE': 0,
'NUM': 38,
'REJECTED': 21,
'UNIQUE': 0,
'memsize': '90.0 MiB',
'n': 193378,
'n_duplicates': 447,
'nvar': 61,
'recordsize': '488.0 B',
'total_missing': 0.49723492373469169}
In [9]:
# desc.keys()
html = pandas_profiling.to_html(df.head(), desc).encode('utf8')
with open('report.html', 'w') as fout:
fout.write(html)
display(HTML(html))
# report = pandas_profiling.ProfileReport(df)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-9-e3c1a3e88899> in <module>()
2 html = pandas_profiling.to_html(df.head(), desc).encode('utf8')
3 with open('report.html', 'w') as fout:
----> 4 fout.write(html)
5 display(HTML(html))
6 # report = pandas_profiling.ProfileReport(df)
TypeError: write() argument must be str, not bytes
In [ ]:
Content source: totalgood/twip
Similar notebooks: