In [1]:
# Python version
import sys
print('Python: {}'.format(sys.version))
# scipy
import scipy
print('scipy: {}'.format(scipy.__version__))
# numpy
import numpy
print('numpy: {}'.format(numpy.__version__))
# matplotlib
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))
# pandas
import pandas
print('pandas: {}'.format(pandas.__version__))
# scikit-learn
import sklearn
print('sklearn: {}'.format(sklearn.__version__))
# In addition to the online example, these packages below help dealinf with the file system
import glob
import os
In [2]:
# Load libraries
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
In [5]:
# Set the directory to parse, and the column names
dir = "c://Users/Adam Shelboourne/Uni/Physics/Astr310/Python/mu/daily/"
#dir = "/Users/AdamShelbourne/Uni/Physics/Astr310/Python/sso/daily"
# Set the column names for the data
names = ['UTCDateTime', 'LocalDateTime', 'Temperature', 'Counts', 'Frequency', 'MSAS']
In [4]:
# Set the common time grid onto which each night is sampled. The common grid
# is defined by the start time defined below, and the number of hours from that time
beginh = 16 # Hour of start time (hour)
beginm = 30 # Hour of start time (minute)
nhrs = 15 # Number of hours to extend time grid
# Output file name for storing the big 2D array of resampled SQM values. Remove the file
# if it already exists.
outfile = dir + 'all_data.dat'
os.remove(outfile) if os.path.exists(outfile) else None
# Open up files for the outputs. There is one for the SQM float values, and another for
# storing the data of each night. These are merged later, but couldn't easily see how to
# do this directly in one file, due to mixed data types.
f=open(dir+'all_data.dat') #,'a+b')
f2=open(dir+'dates.dat','w')
# Parse the data directory and pick up all the relevant data file names. Note that the directory
# listing will set the order of the files. The default pysqm.py output files are dated, so this should
# be OK.
for name in glob.glob(dir+'20*.dat'):
# Read in the data set
dataset = pandas.read_csv(name,comment="#",sep=';',names=names)
#Establish Local Time as index
times = pandas.to_datetime(dataset['LocalDateTime'])
dataset.index=times
# Set up the common time span, but adapted for the current data's date
rng = pandas.date_range(pandas.to_datetime(dataset['LocalDateTime']).dt.date[0] + pandas.DateOffset(hours=beginh,minutes=beginm), periods=nhrs*60, freq='T')
# Set the new time grid as the index
d = {'Date' : pandas.Series(rng)}
df = pandas.DataFrame(d)
dates = pandas.to_datetime(df['Date'])
df.index = dates
# Resample the original data to minutes, spanning the common grid
start = dates.min().strftime('%H:%M:%S')
end = dates.max().strftime('%H:%M:%S')
sub = dataset.between_time(start_time=start,end_time=end,include_start=True,include_end=True).resample('T').mean().interpolate(method='linear')
# Combine data with this grid, thus putting it on the common frame
comb = df.join(sub['MSAS'])
# Save data as new line in output file
numpy.savetxt(f,[numpy.array(comb['MSAS'])[0:(nhrs*60)-1]])
f2.write(str(pandas.to_datetime(dataset['LocalDateTime']).dt.date[0])+'\n')
# Close the files
f.close()
f2.close()
In [ ]: