In [2]:
import pandas as pd
import numpy as np

In [55]:
dsf = pd.read_csv("dsf.csv", dtype = {'CUSIP': np.str, 'PRC': np.float}, na_values = {'PRC': '-'})
# siccodes = pd.read_csv("siccodes.csv")
dsf.shape


Out[55]:
(2568230, 20)

In [61]:
siccodes = pd.read_table("sic_codes.txt", header = 1)
siccodes.columns = ['HSICCD', 'SICNAME']
siccodes.head(50)


Out[61]:
HSICCD SICNAME
0 100 Agricultural Production Crops
1 110 Cash Grains
2 111 Wheat
3 112 Rice
4 115 Corn
5 116 Soybeans
6 119 Cash Grains, Not Elsewhere Classified
7 130 Field Crops, Except Cash Grains
8 131 Cotton
9 132 Tobacco
10 133 Sugarcane and Sugar Beets
11 134 Irish Potatoes
12 139 Field Crops, Except Cash Grains, Not Elsewhere...
13 160 Vegetables And Melons
14 161 Vegetables and Melons
15 170 Fruits And Tree Nuts
16 171 Berry Crops
17 172 Grapes
18 173 Tree Nuts
19 174 Citrus Fruits
20 175 Deciduous Tree Fruits
21 179 Fruits and Tree Nuts, Not Elsewhere Classified
22 180 Horticultural Specialties
23 181 Ornamental Floriculture and Nursery Products
24 182 Food Crops Grown Under Cover
25 190 General Farms, Primarily Crop
26 191 General Farms, Primarily Crop
27 200 Agriculture production livestock and animal sp...
28 210 Livestock, Except Dairy And Poultry
29 211 Beef Cattle Feedlots
30 212 Beef Cattle, Except Feedlots
31 213 Hogs
32 214 Sheep and Goats
33 219 General Livestock, Except Dairy and Poultry
34 240 Dairy Farms
35 241 Dairy Farms
36 250 Poultry And Eggs
37 251 Broiler, Fryer, and Roaster Chickens
38 252 Chicken Eggs
39 253 Turkeys and Turkey Eggs
40 254 Poultry Hatcheries
41 259 Poultry and Eggs, Not Elsewhere Classified
42 270 Animal Specialties
43 271 Fur-Bearing Animals and Rabbits
44 272 Horses and Other Equines
45 273 Animal Aquaculture
46 279 Animal Specialties, Not Elsewhere Classified
47 290 General Farms, Primarily Livestock And Animal
48 291 General Farms, Primarily Livestock and Animal ...
49 700 Agricultural Services

In [57]:
dsf = dsf[dsf.DATE >= 20100101]
len(dsf.HSICCD.unique())


Out[57]:
60

In [58]:
keep_levels = ['PERMNO', 'DATE', 'PRC', 'VOL', 'SHROUT', 'RET', 'HSICCD']
dsf = dsf[keep_levels]

In [66]:
clean_dsf = dsf.merge(siccodes, how = 'left')

In [67]:
clean_dsf.head()


Out[67]:
PERMNO DATE PRC VOL SHROUT RET HSICCD SICNAME
0 10001 20100104 10.25 18500 4361 -0.004854 4925 Mixed, Manufactured, or Liquefied Petroleum Ga...
1 10001 20100105 10.19 23200 4361 -0.005854 4925 Mixed, Manufactured, or Liquefied Petroleum Ga...
2 10001 20100106 10.31 18700 4361 0.011776 4925 Mixed, Manufactured, or Liquefied Petroleum Ga...
3 10001 20100107 9.96 29200 4361 -0.033948 4925 Mixed, Manufactured, or Liquefied Petroleum Ga...
4 10001 20100108 10.34 25100 4361 0.038153 4925 Mixed, Manufactured, or Liquefied Petroleum Ga...