In [1]:
# `dp` is the datapot package; `datapot` (next line) is the DataPot instance that every
# later cell in this notebook calls detect/fit/transform on.
import datapot as dp
datapot = dp.DataPot()
In [2]:
# Open the JSON Lines dataset once; the same handle `f` is passed to detect(), fit() and
# transform() in the cells below, so it has to stay open across cells.
# The encoding is spelled out because JSON Lines files are UTF-8 and the default text
# encoding of open() depends on the platform/locale.
# Call f.close() once the notebook no longer needs the file.
f = open('../data/job.jsonlines', 'r', encoding='utf-8')
In [3]:
# Run detection on the open file; the displayed repr lists the transformer chosen for
# each field and still reports the number of new features as "Unknown".
# NOTE(review): `limit=100` presumably caps how many records are inspected - confirm
# against the datapot documentation.
datapot.detect(f, limit=100)
Out[3]:
DataPot class instance
- number of features without transformation: 9
- number of new features: Unknown
features to transform:
('Id', [NumericTransformer])
('FullDescription', [TfidfTransformer])
('ContractType', [SVDOneHotTransformer])
('ContractTime', [SVDOneHotTransformer])
('Company', [SVDOneHotTransformer])
('Category', [SVDOneHotTransformer])
('SalaryNormalized', [NumericTransformer])
In [4]:
# Run fit on the same file handle; afterwards the repr reports a concrete number of
# new features (82 in the recorded output) instead of "Unknown".
datapot.fit(f)
Out[4]:
DataPot class instance
- number of features without transformation: 9
- number of new features: 82
features to transform:
('Id', [NumericTransformer])
('FullDescription', [TfidfTransformer])
('ContractType', [SVDOneHotTransformer])
('ContractTime', [SVDOneHotTransformer])
('Company', [SVDOneHotTransformer])
('Category', [SVDOneHotTransformer])
('SalaryNormalized', [NumericTransformer])
In [5]:
# Build the feature table from the same file handle and keep it as `df`.
# The FutureWarning printed by this cell originates inside datapot/__init__.py (its call
# to DataFrame.convert_objects), not from code written in this notebook.
df = datapot.transform(f)
/usr/local/lib/python3.6/site-packages/datapot/__init__.py:137: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)
In [6]:
# Bare expression: display the DataPot summary again after transform(); the recorded
# output matches the summary shown after fit().
datapot
Out[6]:
DataPot class instance
- number of features without transformation: 9
- number of new features: 82
features to transform:
('Id', [NumericTransformer])
('FullDescription', [TfidfTransformer])
('ContractType', [SVDOneHotTransformer])
('ContractTime', [SVDOneHotTransformer])
('Company', [SVDOneHotTransformer])
('Category', [SVDOneHotTransformer])
('SalaryNormalized', [NumericTransformer])
In [7]:
# Bare expression: render the transformed frame. The display is truncated ("..." rows and
# columns); the footer of the recorded output reports 2000 rows x 82 columns.
df
Out[7]:
Id
FullDescription_0
FullDescription_1
FullDescription_2
FullDescription_3
FullDescription_4
FullDescription_5
FullDescription_6
FullDescription_7
FullDescription_8
...
Category_Sales Jobs
Category_Manufacturing Jobs
Category_Teaching Jobs
Category_Creative & Design Jobs
Category_Trade & Construction Jobs
Category_Property Jobs
Category_Admin Jobs
Category_Legal Jobs
Category_Retail Jobs
SalaryNormalized
0
12612628.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.150034
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
25000.0
1
12612830.0
0.013096
0.000000
0.000000
0.007217
0.010781
0.016546
0.000000
0.221615
0.016934
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
30000.0
2
12612844.0
0.040378
0.000186
0.000000
0.000000
0.003483
0.000265
0.000000
0.097930
0.011775
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
30000.0
3
12613049.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.142747
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
27500.0
4
12613647.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.116753
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
25000.0
5
13179816.0
0.000000
0.000000
0.000000
0.004854
0.000000
0.000000
0.000000
0.242607
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
25000.0
6
14131336.0
0.000231
0.000000
0.006631
0.005181
0.000000
0.001546
0.000631
0.277847
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
75000.0
7
14663196.0
0.051913
0.008790
0.008140
0.000000
0.007446
0.029638
0.000000
0.130854
0.000577
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
22000.0
8
14663197.0
0.047292
0.003398
0.000000
0.013567
0.007466
0.024149
0.000000
0.116179
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
23000.0
9
15395797.0
0.022033
0.005670
0.001950
0.000506
0.000000
0.000000
0.019252
0.177728
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
85000.0
10
19047429.0
0.091929
0.000000
0.011103
0.000000
0.008910
0.012707
0.000000
0.037144
0.000528
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
21000.0
11
20199757.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.254383
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
37500.0
12
20638787.0
0.012204
0.000000
0.000000
0.004274
0.000000
0.000000
0.000000
0.222288
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
45000.0
13
20638788.0
0.000000
0.000530
0.000000
0.000000
0.004215
0.000000
0.000000
0.074851
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
40000.0
14
20797143.0
0.012540
0.000335
0.000000
0.004406
0.001646
0.221597
0.000000
0.000000
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16000.0
15
22579462.0
0.068486
0.000000
0.000000
0.000000
0.000000
0.002315
0.000000
0.118653
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
22000.0
16
22581547.0
0.000000
0.000000
0.000000
0.004133
0.000000
0.004486
0.000000
0.245055
0.003701
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
85000.0
17
22933091.0
0.008647
0.000000
0.000000
0.002674
0.000000
0.235507
0.000000
0.000000
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
18000.0
18
23528672.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.070164
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
39500.0
19
23529949.0
0.038935
0.000000
0.000000
0.008941
0.003949
0.000000
0.000000
0.115597
0.012179
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
33500.0
20
23530231.0
0.014189
0.009878
0.000000
0.000000
0.000571
0.003199
0.000000
0.219456
0.008926
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
39500.0
21
24104334.0
0.033130
0.000000
0.000000
0.004448
0.006484
0.162254
0.000000
0.014432
0.014735
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17280.0
22
24835524.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.347702
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
41500.0
23
24835548.0
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.341902
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
31500.0
24
24854800.0
0.008224
0.000000
0.000000
0.008006
0.000000
0.001411
0.000571
0.301495
0.003117
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
32500.0
25
25337172.0
0.027901
0.000000
0.000000
0.000000
0.005098
0.216557
0.003129
0.000000
0.003417
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
24000.0
26
25434785.0
0.000000
0.000000
0.000000
0.002710
0.000000
0.000000
0.000000
0.370478
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
41500.0
27
25452606.0
0.000000
0.000000
0.000000
0.004873
0.000000
0.271060
0.000000
0.000000
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16000.0
28
25452680.0
0.025028
0.000000
0.000000
0.000000
0.002380
0.235892
0.000000
0.000000
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
26000.0
29
25556432.0
0.035556
0.057093
0.000000
0.001779
0.006689
0.054215
0.000000
0.035470
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
31500.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1970
51294422.0
0.025535
0.032397
0.000000
0.008079
0.012227
0.060040
0.000000
0.005375
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
27500.0
1971
51294479.0
0.026788
0.005313
0.000000
0.007259
0.036357
0.074193
0.000000
0.026738
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16000.0
1972
51294626.0
0.011934
0.005105
0.002004
0.000000
0.000000
0.248861
0.000000
0.000000
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16000.0
1973
51294848.0
0.001436
0.000000
0.000000
0.006389
0.000000
0.236870
0.003315
0.000000
0.005304
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16000.0
1974
51327246.0
0.006024
0.000102
0.000000
0.011931
0.000000
0.012180
0.013726
0.027131
0.009898
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
21120.0
1975
51539834.0
0.071009
0.000000
0.000000
0.000000
0.007323
0.001134
0.000000
0.100355
0.007914
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
30000.0
1976
51539851.0
0.066011
0.000000
0.000000
0.001116
0.009254
0.000000
0.000000
0.103326
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
27500.0
1977
51590339.0
0.045721
0.004107
0.032969
0.000000
0.026270
0.001185
0.018264
0.003706
0.066904
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14880.0
1978
51590514.0
0.044216
0.004087
0.033145
0.000000
0.027006
0.001269
0.016508
0.004534
0.066968
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14880.0
1979
51590517.0
0.046238
0.003812
0.034015
0.000000
0.027371
0.000750
0.018631
0.004310
0.067116
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14880.0
1980
51590518.0
0.045477
0.003784
0.033448
0.000000
0.026942
0.000686
0.018379
0.004249
0.067270
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
13000.0
1981
51590521.0
0.045477
0.003784
0.033448
0.000000
0.026942
0.000686
0.018379
0.004249
0.067270
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14880.0
1982
51590547.0
0.011968
0.000000
0.000259
0.000239
0.000000
0.007117
0.009627
0.153438
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
27500.0
1983
51593358.0
0.019674
0.000000
0.002067
0.051605
0.002448
0.011175
0.038570
0.021459
0.014555
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
28500.0
1984
51593375.0
0.009798
0.000000
0.031407
0.021080
0.027400
0.000000
0.019046
0.000000
0.041672
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
8160.0
1985
51593376.0
0.009033
0.000000
0.028957
0.019436
0.025262
0.000000
0.017560
0.000000
0.038421
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
8160.0
1986
51593378.0
0.008206
0.000000
0.029258
0.019778
0.025537
0.000000
0.017865
0.000000
0.039089
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
8160.0
1987
51593379.0
0.008907
0.000000
0.029192
0.019513
0.025574
0.000000
0.017717
0.000000
0.038596
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17280.0
1988
51593381.0
0.006742
0.049407
0.003259
0.010983
0.016176
0.009576
0.000000
0.010763
0.021938
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17280.0
1989
51593387.0
0.039204
0.057453
0.005924
0.005894
0.000000
0.000000
0.015855
0.005436
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
57500.0
1990
51593391.0
0.048332
0.000000
0.014931
0.003997
0.000000
0.017977
0.045267
0.027232
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
20000.0
1991
51593402.0
0.075834
0.000000
0.000000
0.001428
0.012600
0.012524
0.000000
0.010915
0.120404
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
38400.0
1992
51593410.0
0.060365
0.000000
0.002623
0.000000
0.000000
0.003392
0.030380
0.010786
0.000000
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
19200.0
1993
51593411.0
0.070065
0.000000
0.000000
0.007476
0.009776
0.017095
0.000000
0.004580
0.126852
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
54720.0
1994
51593412.0
0.073299
0.000000
0.000000
0.000000
0.000000
0.000000
0.001232
0.000000
0.074945
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
49920.0
1995
51593413.0
0.072031
0.001955
0.001990
0.005146
0.006916
0.016816
0.000000
0.008248
0.131446
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
60480.0
1996
51593416.0
0.075554
0.071577
0.008496
0.000460
0.033244
0.018337
0.000000
0.034091
0.008897
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
26000.0
1997
51593418.0
0.062593
0.020644
0.035022
0.000000
0.020641
0.000000
0.017177
0.000000
0.043776
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17000.0
1998
51593420.0
0.070777
0.078777
0.009252
0.000000
0.015127
0.001476
0.147340
0.002519
0.012580
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
23500.0
1999
51593421.0
0.074181
0.000000
0.000000
0.027787
0.000000
0.005971
0.000000
0.000000
0.109913
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
53760.0
2000 rows × 82 columns
In [8]:
# (rows, columns) of the transformed frame.
df.shape
Out[8]:
(2000, 82)
In [ ]:
Content source: bashalex/datapot
Similar notebooks: