In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the series table from the gzip-compressed CSV. No `compression`
# argument is passed, so this relies on read_csv's default inference
# from the ".gz" file extension.
df = pd.read_csv(filepath_or_buffer='../data/M4Comp.csv.gz')

In [3]:
# Preview the first five rows (5 is also head()'s default) to check the
# column layout after loading.
df.head(n=5)


Out[3]:
K ID TYPE H PERIOD UNITS N PAST FUTURE
0 SERIES FIN1 FINANCE 8 MONTHLY NaN 170 87.33542282,86.10394147,84.98490317,84.4860577... 85.00849262,82.86804888,83.14415073,83.1855927...
1 SERIES FIN2 FINANCE 14 MONTHLY NaN 331 111.9,109.8,109.5,108.8,117.9,122.8,120.9,119.... 285.9,288.9,283.2,293.4,291.2,301,302.1,317.2,...
2 SERIES FIN3 FINANCE 14 MONTHLY NaN 331 91.9,89.5,89.8,91.5,99.1,102.9,100.5,99.1,102.... 96.4,96.7,94.5,97,96.3,98.6,98.8,102.8,100.7,9...
3 SERIES FIN4 FINANCE 8 MONTHLY NaN 178 3.949,3.793,3.611,3.597,3.51,3.327,3.482,3.446... 1.48,1.718,1.52,1.334,1.448,1.173,1.276,1.406
4 SERIES FIN5 FINANCE 8 MONTHLY NaN 178 4.39,4.14,3.828,3.871,3.811,3.859,4.04,3.939,3... 1.624,1.662,1.737,1.591,1.629,1.592,1.461,1.509

In [4]:
# Count rows per distinct PERIOD value, most frequent first.
# Bracket access avoids any clash between a column name and a DataFrame attribute.
df['PERIOD'].value_counts()


Out[4]:
MONTHLY       3750
YEARLY        2240
WEEKLY        1833
DAILY         1762
QUARTERLY      402
BIANNUALLY      13
Name: PERIOD, dtype: int64

In [5]:
# Count rows per distinct TYPE value, most frequent first.
df['TYPE'].value_counts()


Out[5]:
ECONOMICS            2000
FINANCE              2000
INVENTORY            1500
BUSINESS-INDUSTRY    1500
DEMOGRAPHICS         1000
CLIMATE              1000
INTERNET-TELECOM     1000
Name: TYPE, dtype: int64

In [6]:
# Build a {TYPE label: row count} mapping. value_counts() skips missing
# values by default, so rows with a NaN TYPE contribute no key.
lDict = (
    df['TYPE']
    .value_counts()
    .to_dict()
)

In [9]:
# Split the table by TYPE and write each subset to its own CSV file,
# named ../data/M4Comp_<TYPE>.csv.
# to_csv() is called with its defaults, so the original row index is
# written as the first (unnamed) column of every output file.
for series_type in lDict:
    rows_of_type = df.loc[df['TYPE'] == series_type]
    out_path = '../data/M4Comp_' + series_type + ".csv"
    rows_of_type.to_csv(out_path)

In [9]:
# Show the first rows whose H value exceeds 100.
# NOTE(review): H is presumably the forecast horizon -- confirm against the data source.
df.loc[df['H'] > 100].head(n=5)


Out[9]:
K ID TYPE H PERIOD UNITS N PAST FUTURE
26 SERIES FIN27 FINANCE 127 DAILY NaN 3514 0.844,0.845,0.845,0.843,0.84,0.844,0.847,0.851... 2.3752,2.375,2.3727,2.3975,2.3291,2.313,2.323,...
27 SERIES FIN28 FINANCE 328 DAILY NaN 9136 1.0109,1.0102,1.0106,1.0148,1.0154,1.0159,1.01... 1.0854,1.0816,1.0841,1.0794,1.0719,1.0753,1.07...
28 SERIES FIN29 FINANCE 242 DAILY NaN 6742 1.5341,1.5418,1.5264,1.5264,1.5264,1.5371,1.53... 7.2115,7.195,7.1965,7.1912,7.1818,7.1845,7.191...

In [ ]: