Datapot README Example


In [1]:
# Import the datapot package under the short alias `dp`.
import datapot as dp 
# Create the DataPot instance that the following cells call
# detect(), fit() and transform() on.
datapot = dp.DataPot()

In [2]:
# Open the job dataset for reading in text mode (JSON Lines, going by the
# file extension). The path is relative to the notebook's working directory.
# The handle is kept in `f` because the detect/fit/transform cells below
# all receive this same object.
f = open('../data/job.jsonlines', 'r')

In [3]:
# Let DataPot pick a transformer for each field of the input.
# `limit=100` presumably caps how many records are inspected during
# detection -- TODO confirm against the datapot documentation.
# The call is the last expression of the cell, so the resulting summary is
# displayed; at this stage it reports the number of new features as "Unknown".
datapot.detect(f, limit=100)


Out[3]:
DataPot class instance
 - number of features without transformation: 9
 - number of new features: Unknown
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Company', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [4]:
# Fit the detected transformers on the data.
# The same handle `f` that detect() already received is passed again;
# presumably DataPot repositions the file itself -- TODO confirm.
# After fitting, the displayed summary reports a concrete number of new features.
datapot.fit(f)


Out[4]:
DataPot class instance
 - number of features without transformation: 9
 - number of new features: 82
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Company', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [5]:
# Apply the fitted transformers and keep the resulting feature table in `df`.
# The FutureWarning printed by this cell originates inside datapot
# (use of the deprecated pandas `convert_objects`), not from this notebook.
df = datapot.transform(f)
# transform() is the last consumer of the file handle in this notebook and
# `df` no longer needs it, so release the handle here instead of leaking it.
# NOTE: re-running the detect/fit/transform cells afterwards requires
# re-running the cell that opens the file first.
f.close()


/usr/local/lib/python3.6/site-packages/datapot/__init__.py:137: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)

In [6]:
# Display the DataPot summary once more after transform(): a bare expression
# at the end of a cell is rendered through its repr.
datapot


Out[6]:
DataPot class instance
 - number of features without transformation: 9
 - number of new features: 82
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Company', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [7]:
# Display the transformed feature table. The rendering is truncated by
# pandas (rows and columns are elided with "...").
# NOTE(review): `df.head()` would keep the saved output much smaller;
# left unchanged here so the recorded output stays in sync with the code.
df


Out[7]:
Id FullDescription_0 FullDescription_1 FullDescription_2 FullDescription_3 FullDescription_4 FullDescription_5 FullDescription_6 FullDescription_7 FullDescription_8 ... Category_Sales Jobs Category_Manufacturing Jobs Category_Teaching Jobs Category_Creative & Design Jobs Category_Trade & Construction Jobs Category_Property Jobs Category_Admin Jobs Category_Legal Jobs Category_Retail Jobs SalaryNormalized
0 12612628.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.150034 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 25000.0
1 12612830.0 0.013096 0.000000 0.000000 0.007217 0.010781 0.016546 0.000000 0.221615 0.016934 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30000.0
2 12612844.0 0.040378 0.000186 0.000000 0.000000 0.003483 0.000265 0.000000 0.097930 0.011775 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30000.0
3 12613049.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.142747 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27500.0
4 12613647.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.116753 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 25000.0
5 13179816.0 0.000000 0.000000 0.000000 0.004854 0.000000 0.000000 0.000000 0.242607 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 25000.0
6 14131336.0 0.000231 0.000000 0.006631 0.005181 0.000000 0.001546 0.000631 0.277847 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 75000.0
7 14663196.0 0.051913 0.008790 0.008140 0.000000 0.007446 0.029638 0.000000 0.130854 0.000577 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 22000.0
8 14663197.0 0.047292 0.003398 0.000000 0.013567 0.007466 0.024149 0.000000 0.116179 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 23000.0
9 15395797.0 0.022033 0.005670 0.001950 0.000506 0.000000 0.000000 0.019252 0.177728 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 85000.0
10 19047429.0 0.091929 0.000000 0.011103 0.000000 0.008910 0.012707 0.000000 0.037144 0.000528 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 21000.0
11 20199757.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.254383 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 37500.0
12 20638787.0 0.012204 0.000000 0.000000 0.004274 0.000000 0.000000 0.000000 0.222288 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 45000.0
13 20638788.0 0.000000 0.000530 0.000000 0.000000 0.004215 0.000000 0.000000 0.074851 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 40000.0
14 20797143.0 0.012540 0.000335 0.000000 0.004406 0.001646 0.221597 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16000.0
15 22579462.0 0.068486 0.000000 0.000000 0.000000 0.000000 0.002315 0.000000 0.118653 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 22000.0
16 22581547.0 0.000000 0.000000 0.000000 0.004133 0.000000 0.004486 0.000000 0.245055 0.003701 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 85000.0
17 22933091.0 0.008647 0.000000 0.000000 0.002674 0.000000 0.235507 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18000.0
18 23528672.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.070164 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 39500.0
19 23529949.0 0.038935 0.000000 0.000000 0.008941 0.003949 0.000000 0.000000 0.115597 0.012179 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 33500.0
20 23530231.0 0.014189 0.009878 0.000000 0.000000 0.000571 0.003199 0.000000 0.219456 0.008926 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 39500.0
21 24104334.0 0.033130 0.000000 0.000000 0.004448 0.006484 0.162254 0.000000 0.014432 0.014735 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17280.0
22 24835524.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.347702 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 41500.0
23 24835548.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.341902 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 31500.0
24 24854800.0 0.008224 0.000000 0.000000 0.008006 0.000000 0.001411 0.000571 0.301495 0.003117 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 32500.0
25 25337172.0 0.027901 0.000000 0.000000 0.000000 0.005098 0.216557 0.003129 0.000000 0.003417 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 24000.0
26 25434785.0 0.000000 0.000000 0.000000 0.002710 0.000000 0.000000 0.000000 0.370478 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 41500.0
27 25452606.0 0.000000 0.000000 0.000000 0.004873 0.000000 0.271060 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16000.0
28 25452680.0 0.025028 0.000000 0.000000 0.000000 0.002380 0.235892 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 26000.0
29 25556432.0 0.035556 0.057093 0.000000 0.001779 0.006689 0.054215 0.000000 0.035470 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 31500.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1970 51294422.0 0.025535 0.032397 0.000000 0.008079 0.012227 0.060040 0.000000 0.005375 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27500.0
1971 51294479.0 0.026788 0.005313 0.000000 0.007259 0.036357 0.074193 0.000000 0.026738 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16000.0
1972 51294626.0 0.011934 0.005105 0.002004 0.000000 0.000000 0.248861 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16000.0
1973 51294848.0 0.001436 0.000000 0.000000 0.006389 0.000000 0.236870 0.003315 0.000000 0.005304 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 16000.0
1974 51327246.0 0.006024 0.000102 0.000000 0.011931 0.000000 0.012180 0.013726 0.027131 0.009898 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 21120.0
1975 51539834.0 0.071009 0.000000 0.000000 0.000000 0.007323 0.001134 0.000000 0.100355 0.007914 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 30000.0
1976 51539851.0 0.066011 0.000000 0.000000 0.001116 0.009254 0.000000 0.000000 0.103326 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 27500.0
1977 51590339.0 0.045721 0.004107 0.032969 0.000000 0.026270 0.001185 0.018264 0.003706 0.066904 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 14880.0
1978 51590514.0 0.044216 0.004087 0.033145 0.000000 0.027006 0.001269 0.016508 0.004534 0.066968 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 14880.0
1979 51590517.0 0.046238 0.003812 0.034015 0.000000 0.027371 0.000750 0.018631 0.004310 0.067116 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 14880.0
1980 51590518.0 0.045477 0.003784 0.033448 0.000000 0.026942 0.000686 0.018379 0.004249 0.067270 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13000.0
1981 51590521.0 0.045477 0.003784 0.033448 0.000000 0.026942 0.000686 0.018379 0.004249 0.067270 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 14880.0
1982 51590547.0 0.011968 0.000000 0.000259 0.000239 0.000000 0.007117 0.009627 0.153438 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27500.0
1983 51593358.0 0.019674 0.000000 0.002067 0.051605 0.002448 0.011175 0.038570 0.021459 0.014555 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 28500.0
1984 51593375.0 0.009798 0.000000 0.031407 0.021080 0.027400 0.000000 0.019046 0.000000 0.041672 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8160.0
1985 51593376.0 0.009033 0.000000 0.028957 0.019436 0.025262 0.000000 0.017560 0.000000 0.038421 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8160.0
1986 51593378.0 0.008206 0.000000 0.029258 0.019778 0.025537 0.000000 0.017865 0.000000 0.039089 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8160.0
1987 51593379.0 0.008907 0.000000 0.029192 0.019513 0.025574 0.000000 0.017717 0.000000 0.038596 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17280.0
1988 51593381.0 0.006742 0.049407 0.003259 0.010983 0.016176 0.009576 0.000000 0.010763 0.021938 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17280.0
1989 51593387.0 0.039204 0.057453 0.005924 0.005894 0.000000 0.000000 0.015855 0.005436 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 57500.0
1990 51593391.0 0.048332 0.000000 0.014931 0.003997 0.000000 0.017977 0.045267 0.027232 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 20000.0
1991 51593402.0 0.075834 0.000000 0.000000 0.001428 0.012600 0.012524 0.000000 0.010915 0.120404 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 38400.0
1992 51593410.0 0.060365 0.000000 0.002623 0.000000 0.000000 0.003392 0.030380 0.010786 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 19200.0
1993 51593411.0 0.070065 0.000000 0.000000 0.007476 0.009776 0.017095 0.000000 0.004580 0.126852 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 54720.0
1994 51593412.0 0.073299 0.000000 0.000000 0.000000 0.000000 0.000000 0.001232 0.000000 0.074945 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 49920.0
1995 51593413.0 0.072031 0.001955 0.001990 0.005146 0.006916 0.016816 0.000000 0.008248 0.131446 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 60480.0
1996 51593416.0 0.075554 0.071577 0.008496 0.000460 0.033244 0.018337 0.000000 0.034091 0.008897 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 26000.0
1997 51593418.0 0.062593 0.020644 0.035022 0.000000 0.020641 0.000000 0.017177 0.000000 0.043776 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17000.0
1998 51593420.0 0.070777 0.078777 0.009252 0.000000 0.015127 0.001476 0.147340 0.002519 0.012580 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 23500.0
1999 51593421.0 0.074181 0.000000 0.000000 0.027787 0.000000 0.005971 0.000000 0.000000 0.109913 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 53760.0

2000 rows × 82 columns


In [8]:
# Confirm the dimensions of the transformed table as (rows, columns).
df.shape


Out[8]:
(2000, 82)

In [ ]: