In [1]:
import os
import pandas as pd
import matplotlib
import numpy as np
from pandas import Series, DataFrame
import csv
from scipy import stats
import matplotlib.pyplot as plt
import random
import matplotlib.colors as colors
from datetime import date
from datetime import time
from datetime import datetime
import sys
import matplotlib.image as image
from matplotlib.gridspec import GridSpec
In [2]:
## these are some of the methods used to clean the data prior to loading it in the scripts for analysis
## You can do this however you like, but the goal is to have a matrix/dataframe/table
## that has location information (region, city), beach name, length (or area), then the litter data
# looking for and identifying the folder that contains the SLR data
import re
thisDir = os.getcwd()
dirList = os.listdir(thisDir)
folderList = [x for x in dirList if not os.path.isfile(os.path.join(thisDir, x))]
t = re.compile('cedre')#<------ search parameters
theFolders = [x for x in folderList if t.search(x) is not None]#<---- this makes the list of matching folders in the cwd
if len(theFolders) > 1:
    print(theFolders)#<--- if there is more than one match give me the list
else:
    myFolder = os.path.join(thisDir, theFolders[0])
    getIt = os.listdir(myFolder)
    print(getIt)#<--------- if there is just one match make a path to it
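# For orientation, a minimal sketch of the target layout described above;
# the column names and values here are hypothetical placeholders, not the real cedre headers:
exampleLayout = pd.DataFrame({'Region': ['Bretagne'], 'City': ['Brest'],
                              'Beach name': ['Plage X'], 'length': [100],
                              'G21': [4], 'G27': [12]})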
In [3]:
## Construct a path to your csv
## Then call .read_csv
thisCSV = os.path.join(myFolder, getIt[-3])
df = pd.read_csv(thisCSV, index_col=False)
In [4]:
# call df.columns and take a look
# sometimes there are columns created in translation or output that are not part of the data
# there are two ways to handle this: 1 - drop what you don't need, or 2 - extract what you do need
# but first get rid of all columns that are empty or contain only 'Na'/'NaN' values
df.columns
Out[4]:
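# Either option from above is one line; a sketch with hypothetical column names, kept
# commented so nothing runs twice:
# option 1 - drop what you don't need
#   df.drop(['Unnamed: 0', 'Total'], axis=1, inplace=True)
# option 2 - extract what you do need
#   df = df[['Beach name', 'Survey date', 'Period', 'length'] + list(df.columns[4:])]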
In [5]:
# in this case there are at least eleven columns that have nothing to do with survey data
# the script does its own material calculations based on the user criteria file
# the first one is 'Unnamed: 0', the result of not turning off the index when the file is written to csv
colDrop = [df.columns[2]]
colDrop.append(df.columns[0])
df.drop(colDrop, axis=1, inplace=True)
df.columns
Out[5]:
In [6]:
df.index
Out[6]:
In [7]:
# call df.columns and see what you have
# so by process of elimination we have settled on option 1
# we have gotten rid of almost all the columns that are not needed
# there is one left: 'Total'; it was not calculated by this script,
# so there is no way to know how it was produced
In [8]:
# without visualising this it is still hard to 'tell' what is in the output
# one thing to do is eliminate all na values from the matrix/dataframe
df.fillna(0, inplace=True)
# all litter-count values are now numeric!
# drop the existing Total column (there is no way to know how it was really calculated)
#df.drop('Total', axis=1, inplace=True)
# create a new Total column
itemTotal = df.columns[4:]
df['Total'] = df[itemTotal].sum(axis=1)
# with a verified total, get rid of any rows with a Total of 0
df = df[df.Total > 0]
# and drop any item columns that sum to 0 across all surveys
lessThan = []
for p in list(df.columns[4:]):
    b = df[p].sum()
    if b == 0:
        lessThan.append(p)
df.drop(lessThan, axis=1, inplace=True)
# at this point all the litter values can be analysed
# export this to csv and save for later
df.to_csv('cedreClean.csv', index=False)#<--- index=False omits the index, so no 'Unnamed: 0' appears on re-read
df.drop('Total', axis=1, inplace=True)#<----- this needs to go for now, but we will put it back in later
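# a quick check that the litter columns really are numeric after the fillna
# (this assumes, as above, that the litter counts start at column position 4)
df.iloc[:, 4:].dtypes.unique()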
In [9]:
df.columns
Out[9]:
In [10]:
# the output is destined for a large audience, so being able to identify where these locations are is important
# take a look at the beach names and see if all of that makes sense
# each row represents a survey, so the same name will appear multiple times
# get rid of the doubles
names = []
for x in list(df['Beach name']):
    if x not in names:
        names.append(x)
names
# this is where things get sticky: the GPS coordinates need to be included
# so the output can be used for mapping and everything else
# get the gps from data provided by the users
# create a dict that relates the name provided to the gps before changing anything else
Out[10]:
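# A minimal sketch of the name-to-GPS dict described above; the real coordinates
# must come from the surveyors, the values here are placeholders only:
beachGps = {name: (None, None) for name in names}
# beachGps['Plage X'] = (48.39, -4.49)  # fill in (latitude, longitude) per beach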
In [12]:
choices = names #<----------- this is the list of locations that can be chosen later
# Choose a location to graph
for i, x in enumerate(choices):
    print(str(i) + ' - ' + str(x))
userChoice = int(input('\nenter the index number of the city of your choice : '))
In [13]:
# the user preferences are ready to go
# now we need to enter the criteria from the user
# this allows the report data to be in any language
# the default model is in English; however, if somebody makes a criteria file
# in another language we will make that available too
# the reason the German criteria file works for this data set is that it was generated from this set,
# therefore all the codes in the user's data are in the criteria file
# furthermore, report narratives can be adjusted:
# summary table text is entered as variables, as well as chart titles,
# and the user has the option to upload those as well
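# for orientation, the minimum criteria file sketched as csv text; the G codes are
# standard MLW codes, but the rows shown here are illustrative, not the actual
# contents of the cedre file:
# Type,G21,G27,G95
# Material,Plastic,Plastic,Plastic
# Description,Plastic caps and lids,Cigarette butts,Cotton bud sticks
# Source,Recreation,Recreation,Personal hygiene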
In [14]:
# upload the criteria file
criteria = 'cedre/cedreDict.csv'
In [15]:
# run it through the function
# the output is a dataFrame with MLW codes as column headers and the criteria as rows
def userCriteria(csvFile):
    s1 = df.iloc[0:1, 4:]
    s2 = s1.to_dict('list')
    d1 = pd.read_csv(csvFile)
    cols = d1.columns
    if 'Unnamed: 0' in cols:
        d1.drop(['Unnamed: 0'], axis=1, inplace=True)
    #d1.set_index(['Type'], inplace=True)
    d2 = d1.to_dict('list')
    thisList = {}#<--------- an empty dict to store the matched pairs in
    # iterate through all the G codes in the user data
    for i, x in enumerate(s2.keys()):#<------ just looking at the dict keys
        # compare those to the G codes that are in the new criteria
        for y in d2.keys():
            # every G code provided by the user
            # is compared to the G codes in the criteria list
            # so if a code in s2 is in d2 the following happens
            if x == y:
                # x = some G code from the list, such as 'G23'
                # if a match is found
                # the G code is matched to the new criteria
                q = d2[x]#<----------- this picks up the value from the key, value pair in d2
                # stored in a dict
                new = {x: q}#<------- the G code from s2 (aka x) picks up the value from the corresponding code in d2
                # that dict is merged into thisList
                thisList.update(new)
    ds = pd.DataFrame(thisList)
    #saveString = str(folder) + 'your_Criteria.csv'
    #ds.to_csv(saveString)
    #print('\nThe criteria has been matched to your data, the file is stored under ' + str(saveString))
    return ds
ds = userCriteria(criteria)
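# a quick sanity check, assuming the criteria file follows the layout sketched above:
# rows = criteria rows, columns = the G codes present in both files
print(ds.shape)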
In [16]:
# the data is ready to go and can be sorted using the criteria provided
# however, the user has not yet identified what the data in each row of the criteria is
# there is a function for that:
def sortIt():
    print('''\nThe contents of the criteria file need to be named.
Please provide a one word description for the contents of each row.
The minimum size criteria file is three rows plus the column headers
-one row for your specific criteria
-one row for a description of the MLW code
-one row for the material\n''')
    print('''If the order and names of the rows in your criteria file are:
\nRow one : Material
\nRow two : Description
\nRow three : Source
\nOr if you are using the criteria file provided you can skip the next three questions''')
    skip = input('Skip the next three questions?(y/n) ')
    if skip == 'y':
        firstRow = 'Material'
        #secondRow = 'Description'
        #thirdRow = 'Source'
        #ds.insert(0, 'Code', [firstRow, secondRow, thirdRow])
        #ds.set_index('Code', inplace=True, drop=True)
    else:
        firstRow = input('Please provide a one word description for the contents of the first row: \n')
        #secondRow = input('\nTitle for the second row: ')
        #thirdRow = input('\nTitle for the third row: ')
    ds.insert(0, 'Code', [firstRow])#, secondRow, thirdRow])
    ds.set_index('Code', inplace=True, drop=True)
    mySort = input('\nWhich one would you like to use to group your data? ')
    #myMat = input('\nWhich one is the material type? ')
    dSortx = ds.loc[mySort]
    #dMatx = ds.loc[myMat]
    dsort = dSortx.to_dict()#<--------- this will be used to group by source
    #dmat = dMatx.to_dict()#<--------- this would group by material
    return [dsort]
dsort = sortIt()[0]#<--------- unpack the dict from the returned list so groupby gets a mapping
print('''At the end of this script you can do other analysis on the formatted data
by calling 'userFile'; it is a pandas DataFrame with the index set to Beach name, Survey date, Period''')
# use the dict to create a groupby instance
# for each row in the criteria file you can create a groupby instance
# however, row two is the description, so that wouldn't be very helpful
def makeDfs():
    df.set_index(['Beach name', 'Survey date', 'Period'], inplace=True)
    userFile = df.copy()
    source = df.groupby(dsort, axis=1)#<----- groupby instance
    #mat = df.groupby(dmat, axis=1)#<----- groupby instance
    # here is the data that will be charted
    oneMore = pd.DataFrame(source.sum())
    #oneMat = pd.DataFrame(mat.sum())
    #oneMat['Total'] = oneMat.iloc[:, 1:].sum(axis=1)
    oneMore['Total'] = oneMore.iloc[:, 0:].sum(axis=1)
    userFile['Total'] = df.iloc[:, 1:].sum(axis=1)
    return [oneMore, userFile]
oneMore, userFile = makeDfs()
# export the different data frames as csv
# if somebody wants to verify the work or build their own analysis they can
def makeCsvs(x, y):
    for n, data in enumerate(x):
        name = y[n]
        saveString = 'cedre/' + name + '.csv'
        data.to_csv(str(saveString), index_label=True)
makeCsvs([oneMore, userFile], ['oneMore', 'userFile'])
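# the same pattern extends to any other criteria row; a sketch of a second
# groupby, e.g. by material, assuming the dmat lines in sortIt() are re-enabled:
# mat = df.groupby(dmat, axis=1)
# oneMat = pd.DataFrame(mat.sum())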
In [114]:
# map each full column header to its short label: the text between ':' and '['
q = userFile.columns[1:-1]
p = q.str.extract(r':(\D+)\[', expand=False)
abbrevName = []
#p.str.extract(r'(\D+)\[')
for n, k in zip(q, p):
    namedict = {n: k}
    abbrevName.append(namedict)
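# the extraction assumes the headers follow a 'Gxx: label [detail]' pattern,
# for example (hypothetical header):
Series(['G27: Cigarette butts [filters]']).str.extract(r':(\D+)\[', expand=False)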
In [179]:
#### this section turns the top-ten items (plus the other objects)
#### into a percentage of the whole, by dividing by the Total column
## the data is used for ax1
# output is a DataFrame with 1 row and 11 columns
# the row name is the location
def topTen(file):
    k = list(file.columns)
    l = Series(file[k[1:-1]].sum(), name='Lake')
    l = l.sort_values(ascending=False)
    tenx = l.index.values[:10]
    elevenX = l.index.values[10:]
    checkThis = pd.DataFrame(file[tenx])
    theStuff = Series(file[elevenX].sum(axis=1), name='All other objects')
    checkThis = pd.concat([checkThis, theStuff], axis=1)
    this = pd.DataFrame(file['Total'])
    thisRightHere = pd.concat([checkThis, this], axis=1)
    return thisRightHere
def the2ColumnDivider(thedataframe, theDenominator):
    cols = list(thedataframe.columns[0:-1])
    anotherDF = pd.DataFrame()
    for i, x in enumerate(cols):
        op = thedataframe[cols[i]]
        op2 = thedataframe[theDenominator]
        opRatio = (op/op2)*100
        thisRatio = Series(opRatio, name=cols[i])
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    return anotherDF
def theDenseDivider(theDataFrame, theSorter, percent):
    cols = list(theDataFrame.columns[1:])
    anotherDF = pd.DataFrame()
    for i, x in enumerate(cols):
        op = theDataFrame[cols[i]]
        op2 = 100
        if percent == 'y':
            opRatio = (op/op2)*100
        else:
            opRatio = op/op2
        thisRatio = Series(opRatio, name=cols[i])
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    densityGroup = anotherDF.groupby(theSorter, axis=1)
    sourceDensity = pd.DataFrame(densityGroup.sum())
    return sourceDensity
# note: this duplicates theDenseDivider; the theDenominator argument is not used
def noSortDenseDivider(theDataFrame, theDenominator, theSorter, percent):
    cols = list(theDataFrame.columns[1:])
    anotherDF = pd.DataFrame()
    for i, x in enumerate(cols):
        op = theDataFrame[cols[i]]
        op2 = 100
        if percent == 'y':
            opRatio = (op/op2)*100
        else:
            opRatio = op/op2
        thisRatio = Series(opRatio, name=cols[i])
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    densityGroup = anotherDF.groupby(theSorter, axis=1)
    sourceDensity = pd.DataFrame(densityGroup.sum())
    return sourceDensity
def topTenLength(file):
    k = list(file.columns)
    l = Series(file[k[1:-1]].sum(), name='Lake')
    l = l.sort_values(ascending=False)
    tenx = l.index.values[:10]
    elevenX = l.index.values[10:]
    checkThis = pd.DataFrame(file[tenx])
    theStuff = Series(file[elevenX].sum(axis=1), name='Other')
    checkThis = pd.concat([checkThis, theStuff], axis=1)
    this = pd.DataFrame(file['length'])#<--- use the argument, not the global, so slices line up
    thisRightHere = pd.concat([checkThis, this], axis=1)
    return thisRightHere
### this function divides the columns by either the total or the length
## you can select either percent 'y' (divide by the total)
## or 'n' (divide by the length)
# output is a DataFrame
def thetopTenDivider(theDataFrame, theDenominator, percent):
    cols = list(theDataFrame.columns[:-1])
    anotherDF = pd.DataFrame()
    for i, x in enumerate(cols):
        op = theDataFrame[cols[i]]
        op2 = theDataFrame[theDenominator]
        if percent == 'y':
            opRatio = (op/op2)*100
        else:
            opRatio = op/op2
        thisRatio = Series(opRatio, name=cols[i])
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    return anotherDF
def thetopTenDense(theDataFrame, theDenominator):
    cols = list(theDataFrame.columns[0:-1])
    anotherDF = pd.DataFrame()
    for i, x in enumerate(cols):
        op = theDataFrame[cols[i]]
        op2 = theDenominator
        opRatio = op/op2
        thisRatio = Series(opRatio, name=cols[i])
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    return anotherDF
def otherTopTen(theReferenceData, theData):
    thisUnDf = pd.DataFrame()
    for gCode in list(theReferenceData.index[:-1]):
        thisUnItem = Series(theData.loc[gCode], name=gCode)
        thisUnDf = pd.concat((thisUnDf, thisUnItem), axis=1)
    return thisUnDf
def dropColAndSum(dataToDropFrom, otherTopTenColsToDrop, lengthOrTotalcolumn, nameOtherObjects):
    dropMe = dataToDropFrom.drop(list(otherTopTenColsToDrop.columns))
    dropMe = Series(dropMe.drop(lengthOrTotalcolumn).sum(), name=nameOtherObjects)
    theOtherTopTen = pd.concat((otherTopTenColsToDrop, dropMe), axis=1)#<--- concat with the argument; nothing else is in scope here
    return theOtherTopTen
# top-ten % of total
# use the userFile
# accepts either a slice or the entire data frame
def topTenInput(dataFrame):
    x = pd.DataFrame(Series(topTen(dataFrame).sum()))
    xtrans = x.transpose()
    ax1Data = the2ColumnDivider(xtrans, 'Total')
    ax1Data = ax1Data.iloc[0].sort_values(ascending=False)
    return ax1Data
# material as % of total
# use the userFile
# accepts either a slice or the entire data frame
# note: not called by siteReport below; it needs the material dict (dmat)
# from sortIt(), which is currently commented out
def matPerInput(dataFrame):
    q = theDenseDivider(dataFrame, dmat, 'y')
    ax2Input = pd.DataFrame(Series(q.mean(), name='Material')).transpose()#<--- 'Material' is a placeholder label
    ax2Input = ax2Input.iloc[0].sort_values(ascending=False)
    return ax2Input
# Source density input
# use the userFile
# accepts either a slice or the entire data frame
def sourceDenInput(dataFrame):
    q = theDenseDivider(dataFrame, dsort, 'n')
    ax3Input = pd.DataFrame(Series(q.mean())).transpose()
    ax3Input = ax3Input.iloc[0].sort_values(ascending=False)
    return ax3Input
def topTenDensity(dataFrame):
    q = pd.DataFrame(Series(topTen(dataFrame).mean()))
    transq = q.transpose()
    topTenx = thetopTenDense(transq, 100)
    topTenx = topTenx.iloc[0].sort_values(ascending=False)
    return topTenx
### this section gets the info for the summary data
# output is the string used in the summary axes
# use the userFile
# accepts either a slice or the entire data frame
def summaryString(dataFrame):
    nSamples = len(list(dataFrame.index.get_level_values(0)))#<----- number of samples
    averageDensity = sourceDenInput(dataFrame).sum().sum().round(2)#<------ averaged density
    totalNumber = dataFrame['Total'].sum()#<------------ total number of pieces
    dates = dataFrame.index.get_level_values('Survey date')#<------- getting the dates from the index
    dates = pd.to_datetime(dates)#<--- keep the converted dates so min/max compare datetimes, not strings
    theEarliest = min(dates)#<--------- the earliest date
    theLatest = max(dates)#<------------- the latest date
    logOfAverage = np.log(averageDensity)
    #summaryData = userChoice
    summaryData2 = ('Total number of pieces: ' + str(totalNumber) +
                    '\nAverage density: ' + str(averageDensity) + ' pcs/m' +
                    '\nFirst sample: ' + str(theEarliest) +
                    '\nLast sample : ' + str(theLatest) +
                    '\nNumber of samples: ' + str(nSamples) +
                    '\n')
    return [summaryData2, nSamples, logOfAverage]
def logsDense(dataFrame):
    z = dataFrame.copy()
    z['Dense'] = z['Total']/100#<--- density per meter over the 100m reference used throughout
    z['Logs'] = z['Dense'].apply(np.log)
    mu, sigma = stats.norm.fit(z['Logs'])
    points = len(z['Logs'])
    theLogs = z['Logs']
    return [mu, sigma, points, theLogs]
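# usage sketch: each input builder accepts either one location (a slice of
# userFile) or the whole frame, e.g. (commented; the beach name is whatever
# appears in the choices list):
# topTenInput(userFile.loc[choices[0]])
# sourceDenInput(userFile.loc[choices[0]])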
In [191]:
colors = ['#003333', '#339966', '#66CCCC',
          '#3399CC', '#993333', '#CC6600', '#FF0033',
          '#FFCC00', '#3366CC', '#66CC00', '#333300',
          '#FF0000', '#ec00ff', '#00ff32']#<--- note: this shadows the matplotlib.colors import, which this section does not use
def siteReport(dataFrame, dataFrameAll):
    one = topTenInput(dataFrame)
    two = sourceDenInput(dataFrame)
    three = topTenDensity(dataFrame)
    summaryData2, nSamples, logOfAverage = summaryString(dataFrame)
    summaryData = summaryTitle#<--- summaryTitle is a global set in the loop at the bottom
    mu, sigma, points, logs = logsDense(dataFrame)
    theMu, theSigma, thePoints, theLogs = logsDense(dataFrameAll)
    fig = plt.figure(figsize=(8.27, 11.69), frameon=False, edgecolor='#000000', linewidth=1)
    rect0 = .65, .71, .1, .24
    rect1 = .65, .36, .1, .24
    rect2 = .13, .36, .1, .24
    rect3 = .23, .7, .06, .06
    rect4 = .08, .9, .4, .05
    rect5 = .08, .82, .4, .09
    rect6 = .13, .07, .4, .195
    rect7 = .6, .12, .3, .09
    rect8 = .62, .16, .2, .1
    rect9 = .08, .66, .3, .02
    ax1 = fig.add_axes(rect0)#<----------- x2TopTenSummary
    ax2 = fig.add_axes(rect1)#<----------- matDensitySummaryLocation
    ax3 = fig.add_axes(rect2)#<----------- sourceDensitySummaryLocation
    ax4 = fig.add_axes(rect3)#<----------- performance
    ax5 = fig.add_axes(rect4)#<----------- summaryDataTitle
    ax8 = fig.add_axes(rect7)#<----------- summaryData
    ax6 = fig.add_axes(rect5)#<----------- summaryData2
    ax7 = fig.add_axes(rect6)#<----------- comparison chart
    ax9 = fig.add_axes(rect8)#<----------- image
    theAxes = [ax1, ax2, ax3]
    data = [one, two, three]
    unitsLabel = 'meter'
    for t, r in enumerate(data):
        blocks = 0.1
        width = 0.6
        bottom = 0
        z = r.index
        q = theAxes[t]
        barLabels = []
        for c in z:
            for n in abbrevName:
                for key, value in n.items():
                    if c == key:
                        barLabels.append(value)
        if len(barLabels) > 0:
            barLabels.append('Other objects')
        else:
            barLabels = z
        for i, u in enumerate(z):
            color = colors[i]
            q.bar(blocks, r[u], width=width, bottom=bottom, color=color, label=barLabels[i])
            bottom += r[u]
        handles, labels = q.get_legend_handles_labels()
        q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1), borderaxespad=0., fancybox=False, frameon=False)
        if t == 0:
            title = 'Top-ten items found'
            ylabel = 'Top-ten objects as a % of total'
            q.set_ylim(top=102)
        elif t == 1:
            title = 'Materials'
            ylabel = 'Material pieces per meter'
        elif t == 2:
            title = 'Top-ten items per meter'
            ylabel = 'Pieces of trash per ' + unitsLabel
        q.set_title(title, size=14, loc='left')
        q.set_ylabel(ylabel, size=12)
        q.tick_params(axis='x', labelbottom='off', bottom='off', top='off')
        if nSamples > 1:
            noSample = 'samples'
        else:
            noSample = 'sample'
        q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
        q.xaxis.labelpad = 20
    percen = round(stats.percentileofscore(theLogs, logOfAverage))
    if percen <= 16:
        color = 'g'
    elif percen < 50:
        color = 'y'
    elif percen < 84:
        color = '#FF4500'
    else:
        color = 'r'
    zText = 'Density is greater than '
    theRest = '%\n of all other locations'
    ax4.scatter(1, 1, s=500, color=color)
    ax4.set_title('Overall performance')
    ax4.set_xlabel(zText + str(percen) + theRest)
    ax4.tick_params(axis='both', labelbottom='off', bottom='off', top='off', left='off')
    ax4.set_yticklabels([])
    ax4.set_xticklabels([])
    ax5.text(0, 1, summaryData, size=16, wrap=True, verticalalignment='top')
    ax6.text(0, 1, summaryData2, size=14, verticalalignment='top')
    ax5.set_axis_off()
    ax6.set_axis_off()
    theZscore = (theLogs.mean() - theMu)/theSigma
    sixteenth = theMu + -1*theSigma
    eightyFourth = theMu + 1*theSigma
    ratio = theMu/theSigma
    zLessThanNegOne = sixteenth
    zLessThanZero = theMu
    zLessThanOne = eightyFourth
    def logNormalCurve(theData, numberOfpoints):
        xmin = min(theData)
        xmax = max(theData)
        x = np.linspace(xmin, xmax, numberOfpoints)
        pdf = stats.norm.pdf(x, loc=theMu, scale=theSigma)
        return [x, pdf, xmin, xmax]
    x, pdf, xmin, xmax = logNormalCurve(theLogs, thePoints)
    px = logOfAverage
    py = stats.norm.pdf(px, loc=theMu, scale=theSigma)
    ax7.plot(x, pdf, color='b', linestyle='dashed', alpha=1)
    ax7.fill_between(x, stats.norm.pdf(x, loc=theMu, scale=theSigma), where=(x <= px), color='g', alpha=0.3)
    ax7.vlines(px, 0, stats.norm.pdf(theMu, loc=theMu, scale=theSigma), color='b', linestyle='-.')
    ax7.scatter(px, py, s=200, color='r')
    ax7.spines['bottom'].set_position('zero')
    ax7.set_title('The distribution of all values and this location')
    ax7.set_xlabel('The ln of litter density', size=14)
    today = date.today()
    closingText = ('This report printed on ' + str(today) +
                   '\nCreated using Python 3.6.2\nMath, trash and water\nwww.hammerdirt.ch')
    ax8.text(0, 0, closingText, size=12, wrap=True, verticalalignment='top')
    ax8.set_axis_off()
    imageFile = 'weblogo.jpg'
    im = image.imread(imageFile)
    ax9.imshow(im, aspect='auto', zorder=-1)
    ax9.set_axis_off()
    from matplotlib.backends.backend_pdf import PdfPages
    plt.savefig(summaryData, bbox_inches='tight')#<--- saves a copy named after the location title
    saveDocString = str(summaryData) + '_litter_Density.pdf'
    pp = PdfPages(saveDocString)
    fig.savefig(pp, format='pdf')
    pp.close()
    plt.show()
    plt.close()
In [192]:
for x in choices:
    summaryTitle = str(x[:25])#<--- read as a global inside siteReport
    siteReport(userFile.loc[x], userFile)
In [ ]: