In [1]:
import pandas as pd

import numpy as np

import random

import csv

In [2]:
df = pd.read_csv("USstockdata.csv")

In [ ]:


In [3]:
#removes rows with empty data and data containing strings and errors

df.dropna()


df = df[df.RET != "C"]
df = df[df.RET != "B"]

In [4]:
# converts data to proper formats, high precision floats and datetime

df["RET"] = df["RET"].astype(np.float64)
df["date"] = pd.to_datetime(df["date"])
df.set_index(df["date"])
df = df.set_index(pd.DatetimeIndex(df["date"]))

In [5]:
# df.set_index(df["Names Date"])
# df_m = df.groupby(["PERMNO", pd.TimeGrouper(freq='M')])["Returns"].sum()

#filters by count greater than 223

df =df.groupby( "PERMNO").filter( lambda x: len(x) >223)

In [28]:
df.head()


Out[28]:
PERMNO date RET number_of_values
1994-12-30 10001 1994-12-30 -0.033433 NaN
1995-01-31 10001 1995-01-31 -0.031250 NaN
1995-02-28 10001 1995-02-28 -0.026210 NaN
1995-03-31 10001 1995-03-31 0.006377 NaN
1995-04-28 10001 1995-04-28 0.000000 NaN

In [22]:
def sampling_function(number_in_sample, number_of_repeats):
    """
    Takes a random sample of stocks and calculates average monthly return for a group of them
    Data is stored in a csv file for further analysis
    Args:
    number_in_sample: number of different stocks
    number_of_repeats: times the function was run
    
    """
    lista = []
    for n in xrange(1, number_of_repeats):
        dfList = df["PERMNO"].tolist()
        rand_smpl = [ dfList[i] for i in sorted(random.sample(xrange(len(dfList)), number_in_sample)) ]
#         with open('filename.csv', 'a') as myfile:
#             wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#             wr.writerow(rand_smpl)

        random_sample_df = df[df["PERMNO"].isin(rand_smpl)]
        series_sum_monthly = random_sample_df.groupby( pd.TimeGrouper(freq='M'))["RET"].sum()
        frame_sum_monthly = series_sum_monthly.to_frame()
        frame_sum_monthly["Ratio"] = frame_sum_monthly.RET/number_in_sample
        frame_sum_monthly["Variance"] = frame_sum_monthly.RET.var()
        
        pp = frame_sum_monthly["Variance"].iloc[0]
        


        with open('variance.csv', 'a') as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerow([pp])
        frame_sum_monthly.to_csv('filename.csv',mode = 'a',header ='column_names')
        
        lista.append(float(pp))
        
    rezultat = sum(lista)/float(number_in_sample)
        
    return rezultat

In [33]:
#example of a function call

listoo = list(range(df["PERMNO"].nunique()))
listoo.pop(0)


Out[33]:
0

In [34]:
for n in listoo:
    sampling_function(n, 1000)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-34-3628ba65e7ce> in <module>()
      1 
      2 for n in listoo:
----> 3     sampling_function(n, 1000)

<ipython-input-22-5e90c4c217d2> in sampling_function(number_in_sample, number_of_repeats)
     12     lista = []
     13     for n in xrange(1, number_of_repeats):
---> 14         dfList = df["PERMNO"].tolist()
     15         rand_smpl = [ dfList[i] for i in sorted(random.sample(xrange(len(dfList)), number_in_sample)) ]
     16 #         with open('filename.csv', 'a') as myfile:

/home/tric/anaconda2/lib/python2.7/site-packages/pandas/core/series.pyc in tolist(self)
   1052     def tolist(self):
   1053         """ Convert Series to a nested list """
-> 1054         return list(self)
   1055 
   1056     def to_dict(self):

KeyboardInterrupt: 

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: