In [1]:
import pandas as pd

import numpy as np

import random

import csv

In [2]:
df = pd.read_csv("USstockdata.csv")

In [ ]:


In [4]:
#removes rows with empty data and data containing strings and errors

df.dropna()


df = df[df.RET != "C"]
df = df[df.RET != "B"]

In [5]:
# converts data to proper formats, high precision floats and datetime

df["RET"] = df["RET"].astype(np.float64)
df["date"] = pd.to_datetime(df["date"])
df.set_index(df["date"])
df = df.set_index(pd.DatetimeIndex(df["date"]))

In [40]:
# df.set_index(df["Names Date"])
# df_m = df.groupby(["PERMNO", pd.TimeGrouper(freq='M')])["Returns"].sum()

#filters by count greater than 223

df =df.groupby( "PERMNO").filter( lambda x: len(x) >223)


Out[40]:
PERMNO date RET number_of_values
1994-12-30 10001 1994-12-30 -0.033433 NaN
1995-01-31 10001 1995-01-31 -0.031250 NaN
1995-02-28 10001 1995-02-28 -0.026210 NaN
1995-03-31 10001 1995-03-31 0.006377 NaN
1995-04-28 10001 1995-04-28 0.000000 NaN

In [28]:
df.head()


Out[28]:
PERMNO date RET number_of_values
1994-12-30 10001 1994-12-30 -0.033433 NaN
1995-01-31 10001 1995-01-31 -0.031250 NaN
1995-02-28 10001 1995-02-28 -0.026210 NaN
1995-03-31 10001 1995-03-31 0.006377 NaN
1995-04-28 10001 1995-04-28 0.000000 NaN

In [43]:
def sampling_function(number_in_sample, number_of_repeats):
    """
    Takes a random sample of stocks and calculates average monthly return for a group of them
    Data is stored in a csv file for further analysis
    Args:
    number_in_sample: number of different stocks
    number_of_repeats: times the function was run
    """
    for n in xrange(1, number_of_repeats):
        dfList = df["PERMNO"].tolist()
        rand_smpl = [ dfList[i] for i in sorted(random.sample(xrange(len(dfList)), number_in_sample)) ]
#         with open('filename.csv', 'a') as myfile:
#             wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#             wr.writerow(rand_smpl)

        random_sample_df = df[df["PERMNO"].isin(rand_smpl)]
        series_sum_monthly = random_sample_df.groupby( pd.TimeGrouper(freq='M'))["RET"].sum()
        frame_sum_monthly = series_sum_monthly.to_frame()
        frame_sum_monthly["Ratio"] = frame_sum_monthly.RET/number_in_sample
        frame_sum_monthly["Variance"] = frame_sum_monthly.RET.var()
        
        pp = frame_sum_monthly["Variance"].iloc[0]
        


        with open('variance.csv', 'a') as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerow([pp])
        frame_sum_monthly.to_csv('filename.csv',mode = 'a',header ='column_names')
        
    lista = lista.append(pp)
        
    rezultat = sum(lista)/number_in_sample
        
    return rezultat

In [44]:
#example of a function call
b = sampling_function(1, 100)

In [42]:
for n in xrange(df["PERMNO"].nunique())
    sampling_function(n, 1000)


Out[42]:
2238

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: