In [1]:
import pandas as pd
import numpy as np
import json
import os
In [2]:
# Read in the raw gzipped CSV. The first 4 rows are file-level metadata, so skip
# them; the string 'No data' marks missing readings and is converted to NaN.
df = pd.read_csv('Devonshire_Green_meteorological_data-raw.csv.gz', skiprows=range(4), na_values=['No data'])
In [3]:
# Keep only the measurement columns: the '... Status' columns carry
# units / QA flags, which are stored separately as metadata.
measurement_cols = [name for name in df.columns if 'Status' not in name]
df_data = df[measurement_cols].copy()
In [4]:
# Convert the sensor columns (everything after Date/Time) from object dtype to
# numeric. NOTE: the previous `df_data.iloc[:, 2:] = ...` form writes the
# converted values back INTO the existing object-dtype columns, and in recent
# pandas versions that assignment does not change the column dtype — so the
# frame stayed object. Assigning each column directly replaces the column and
# its dtype.
for col in df_data.columns[2:]:
    df_data[col] = pd.to_numeric(df_data[col])
In [5]:
# The file ends with a sentinel row whose Date is the literal string 'End';
# it is not real data, so filter it out.
end_marker_mask = df_data['Date'] != 'End'
df_data = df_data.loc[end_marker_mask]
In [6]:
# Split the 'Date' column (YYYY-MM-DD strings) into integer Year/Month/Day
# columns. `str.split(..., expand=True)` is the vectorized replacement for
# the slow `.split(...).apply(pd.Series)` pattern (which builds one Series
# per row).
date_parts = df_data.Date.str.split('-', expand=True).apply(pd.to_numeric)
date_parts.columns = ['Year', 'Month', 'Day']
# then merge this back into our main dataframe
df_data = pd.concat([df_data, date_parts], axis=1)
In [7]:
# Convert the 'Date' and 'Time' columns to numpy datetime64 and timedelta64
# datatypes respectively.
df_data['Time'] = pd.to_timedelta(df_data['Time'])
df_data['Date'] = pd.to_datetime(df_data['Date'])
# Hour of day straight from the 'Time' offset. The original computed
# ((Date + Time) - Date), which is identically Time; the vectorized
# .dt.total_seconds() replaces a Python-level apply, and builtin `int`
# replaces `np.int`, which was removed in NumPy 1.24.
df_data['Hour'] = (df_data['Time'].dt.total_seconds() / 3600).astype(int)
# Replace all 'Hour' values of 24 with 0 (a '24:00:00' timestamp is treated
# as hour 0 — NOTE(review): the Date is NOT rolled to the next day; confirm
# this matches the source data's convention)
df_data['Hour'] = df_data['Hour'].replace(24, 0)
# Drop the 'Date' and 'Time' columns — superseded by Year/Month/Day/Hour
df_data.drop(['Date', 'Time'], axis=1, inplace=True, errors='ignore')
In [8]:
# Move the four date-part columns (Year, Month, Day, Hour) — currently the
# last four — to the front, ahead of the sensor columns.
column_order = list(df_data)
df_data = df_data[column_order[-4:] + column_order[:-4]]
In [9]:
# Trim the verbose ' (Hourly measured)' suffix from the column names.
# A callable mapper applies to every column; reassigning avoids inplace.
df_data = df_data.rename(columns=lambda name: name.replace(' (Hourly measured)', ''))
In [10]:
# Preview the first rows of the preprocessed dataframe
df_data.head()
Out[10]:
In [11]:
# Read the accompanying JSON metadata produced by the preprocessing step
with open('Devonshire_Green_meteorological_metadata-preproc.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)
# Display it as the cell output
metadata
Out[11]:
In [12]:
# Visual sanity check: plot every sensor column (columns 4 onward) on its own
# subplot. NOTE(review): the matplotlib import would normally live in the top
# import cell, and `%matplotlib notebook` is deprecated in modern Jupyter
# (ipympl's `%matplotlib widget` is the replacement) — left as-is here.
import matplotlib.pyplot as plt
plt.close('all')
%matplotlib notebook
df_data.iloc[:, 4:].plot(subplots=True)
plt.show()
In [58]:
# Check that we can reconstruct a datetime64 array from the date-part columns
def date_parts_to_datetime64(year, month=1, day=1, hour=0):
    """Build a datetime64 array from broadcastable year/month/day/hour values.

    Values may be floats (as produced by ``DataFrame.values``); each is
    rendered as a zero-padded integer in an ISO-like string and parsed by
    numpy. ``month``/``day`` default to 1 (January 1st): the previous
    defaults of 0 produced invalid '...-00-00' dates that datetime64 rejects.
    """
    # The original spec '{:4>2.0f}' used '4' as the FILL character with a
    # minimum width of 2 — it only worked by accident for 4-digit years.
    # '{:0>4.0f}' zero-pads the year to 4 digits as intended.
    return np.array(['{:0>4.0f}-{:0>2.0f}-{:0>2.0f} {:0>2.0f}:00:00'.format(y, m, d, h)
                     for (y, m, d, h) in np.broadcast(year, month, day, hour)],
                    dtype='datetime64')
# Round-trip check on the real data: rebuild datetime64 values from the first
# four columns (Year/Month/Day/Hour) and report shape/dtype/range.
# The dead `df_col_names = list(df_data)` assignment (its value was never
# used) has been removed.
arr_2d = df_data.values
dt64s = date_parts_to_datetime64(*arr_2d[:, :4].T)
dt64s.shape, dt64s.dtype.name, dt64s.min(), dt64s.max()
Out[58]:
In [60]:
# Persist the preprocessed dataframe; the index is just a row counter, so drop it
df_data.to_csv('Devonshire_Green_meteorological_data-preproc.csv', index=False)
In [ ]: