notebook.community

Edit and run



In [1]:

    
import pandas as pd

import numpy as np

import random

import csv



In [2]:

    
df = pd.read_csv("USstockdata.csv")



In [ ]:



In [4]:

    
#removes rows with empty data and data containing strings and errors

df.dropna()


df = df[df.RET != "C"]
df = df[df.RET != "B"]



In [5]:

    
# converts data to proper formats, high precision floats and datetime

df["RET"] = df["RET"].astype(np.float64)
df["date"] = pd.to_datetime(df["date"])
df.set_index(df["date"])
df = df.set_index(pd.DatetimeIndex(df["date"]))



In [40]:

    
# df.set_index(df["Names Date"])
# df_m = df.groupby(["PERMNO", pd.TimeGrouper(freq='M')])["Returns"].sum()

#filters by count greater than 223

df =df.groupby( "PERMNO").filter( lambda x: len(x) >223)









    Out[40]:






  
    
      
      PERMNO
      date
      RET
      number_of_values
    
  
  
    
      1994-12-30
      10001
      1994-12-30
      -0.033433
      NaN
    
    
      1995-01-31
      10001
      1995-01-31
      -0.031250
      NaN
    
    
      1995-02-28
      10001
      1995-02-28
      -0.026210
      NaN
    
    
      1995-03-31
      10001
      1995-03-31
      0.006377
      NaN
    
    
      1995-04-28
      10001
      1995-04-28
      0.000000
      NaN



In [28]:

    
df.head()









    Out[28]:






  
    
      
      PERMNO
      date
      RET
      number_of_values
    
  
  
    
      1994-12-30
      10001
      1994-12-30
      -0.033433
      NaN
    
    
      1995-01-31
      10001
      1995-01-31
      -0.031250
      NaN
    
    
      1995-02-28
      10001
      1995-02-28
      -0.026210
      NaN
    
    
      1995-03-31
      10001
      1995-03-31
      0.006377
      NaN
    
    
      1995-04-28
      10001
      1995-04-28
      0.000000
      NaN



In [43]:

    
def sampling_function(number_in_sample, number_of_repeats):
    """
    Takes a random sample of stocks and calculates average monthly return for a group of them
    Data is stored in a csv file for further analysis
    Args:
    number_in_sample: number of different stocks
    number_of_repeats: times the function was run
    """
    for n in xrange(1, number_of_repeats):
        dfList = df["PERMNO"].tolist()
        rand_smpl = [ dfList[i] for i in sorted(random.sample(xrange(len(dfList)), number_in_sample)) ]
#         with open('filename.csv', 'a') as myfile:
#             wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#             wr.writerow(rand_smpl)

        random_sample_df = df[df["PERMNO"].isin(rand_smpl)]
        series_sum_monthly = random_sample_df.groupby( pd.TimeGrouper(freq='M'))["RET"].sum()
        frame_sum_monthly = series_sum_monthly.to_frame()
        frame_sum_monthly["Ratio"] = frame_sum_monthly.RET/number_in_sample
        frame_sum_monthly["Variance"] = frame_sum_monthly.RET.var()
        
        pp = frame_sum_monthly["Variance"].iloc[0]
        


        with open('variance.csv', 'a') as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerow([pp])
        frame_sum_monthly.to_csv('filename.csv',mode = 'a',header ='column_names')
        
    lista = lista.append(pp)
        
    rezultat = sum(lista)/number_in_sample
        
    return rezultat



In [44]:

    
#example of a function call
b = sampling_function(1, 100)



In [42]:

    
for n in xrange(df["PERMNO"].nunique())
    sampling_function(n, 1000)









    Out[42]:





2238



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	PERMNO	date	RET	number_of_values
1994-12-30	10001	1994-12-30	-0.033433	NaN
1995-01-31	10001	1995-01-31	-0.031250	NaN
1995-02-28	10001	1995-02-28	-0.026210	NaN
1995-03-31	10001	1995-03-31	0.006377	NaN
1995-04-28	10001	1995-04-28	0.000000	NaN