In [33]:
import os
import pandas as pd
import matplotlib
import numpy as np
from pandas import Series, DataFrame
import csv
from scipy import stats
import matplotlib.pyplot as plt
import random
import matplotlib.colors as colors
from datetime import date
from datetime import time
from datetime import datetime
import sys
import matplotlib.image as image
# List the sub-folders of the current working directory so the user can
# see where output could be saved.
thisq = os.getcwd()
thisx = os.listdir(thisq)
# Anything that is not a regular file is treated as a folder.
folder = [item for item in thisx if not os.path.isfile(os.path.join(thisq, item))]
print('\nHere is a list of folders in the current directory:\n')
print(folder)
In [34]:
# Load the two survey files and drop the precomputed 'Total' column;
# totals are recalculated after empty columns are removed.
d1 = pd.read_csv('calData/cWeek1.csv').drop('Total', axis=1)
d2 = pd.read_csv('calData/cCreek2.csv').drop('Total', axis=1)
In [35]:
def colDrop(userFile):
    """Drop all-zero object columns, add a per-row 'Total', keep only rows
    with at least one object, and parse the 'Date' column to datetime.

    The first three columns are treated as metadata and never dropped.
    Returns the cleaned frame (rows that summed to zero are removed,
    original index labels are kept).
    """
    # Object-count columns start at position 3; collect the all-zero ones.
    emptyCols = [col for col in userFile.columns[3:] if userFile[col].sum() == 0]
    userFile = userFile.drop(emptyCols, axis=1)
    # Row total over the remaining object columns; used to find empty rows.
    userFile['Total'] = userFile.iloc[:, 3:].sum(axis=1)
    userFile = userFile[userFile.Total > 0]
    userFile['Date'] = pd.to_datetime(userFile['Date'])
    return userFile
# Clean both frames; .copy() detaches the result from any view produced by
# the boolean filtering inside colDrop before later in-place edits.
d1 = colDrop(d1).copy()
d2 = colDrop(d2).copy()
In [36]:
# Correct a mislabeled location, then index both frames by (Location, Date).
# .loc replaces the original chained assignment d1['Location'][2] = ...,
# which triggers SettingWithCopyWarning and can silently fail on a copy.
# (The stray no-op 'd1.columns' expression was removed.)
d1.loc[2, 'Location'] = 'Antelope'
d1.set_index(['Location', 'Date'], inplace=True)
d2.set_index(['Location', 'Date'], inplace=True)
In [37]:
import re
# DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0;
# filter(regex=...) applies the same re.search match to column labels and
# keeps only the MLW object-code columns (G1, G23, ...).
justGd1 = d1.filter(regex=r'G\d+', axis=1)
justGd1.insert(0, 'Length', d1['Length'])
justGd2 = d2.filter(regex=r'G\d+', axis=1)
justGd2.insert(0, 'Length', d2['Length'])
myFrames = [justGd1, justGd2]
# Outer join keeps every object code seen at either site; codes absent from
# one survey become NaN and are zero-filled below.
joined = pd.concat(myFrames, axis=0, join='outer', copy=True)
joined.fillna(0, inplace=True)
# Rename 'Length' -> 'length' while keeping it in column position 0.
joined.insert(0, 'Lengths', joined['Length'], allow_duplicates=False)
joined.drop(['Length'], axis=1, inplace=True)
joined = joined.rename(columns={'Lengths': 'length'})
# Flatten the (Location, Date) index back into ordinary columns and save.
joined.reset_index(level=1, drop=False, inplace=True)
joined.reset_index(drop=False, inplace=True)
joined.to_csv('calData/allData.csv', index_label=False, index=False)
In [44]:
import os
import pandas as pd
import matplotlib
import numpy as np
from pandas import Series, DataFrame
import csv
from scipy import stats
import matplotlib.pyplot as plt
import random
import matplotlib.colors as colors
from datetime import date
from datetime import time
from datetime import datetime
import sys
import matplotlib.image as image
# coding: utf-8
# this script is designed to take the data collected from MLW surveys and generate automated reports
# Report menu; the user types the index number of the report they want.
task = ['0 - Create Summary Report for Project', '1 - Create Summary Report for city', '2 - Create Summary Report for Beach']
print(task)
theTask = int(input('''\n\nWith this script you can do prepare one of three reports.
your data needs to be in the correct format and located in the curent working directory
please choose one of the three options listed (the number) '''))
# Hard-coded input paths; the original interactive prompts are kept below,
# commented out, in case they need to be restored.
userCSV = 'calData/allData.csv'
#input('\nIf your file meets these criteria, please enter the complete file name and path here: ')
criteria = 'criteriaFiles/newCriteria.csv'
#input('''\nThis script sorts records by city, beach and date. The records are then assigned criteria.
#please enter the complete file name and path of the criteria file here: ''')
imageFile = 'weblogo.jpg'
#input('What is the name of the image file you want to use for your logo? (6cm X 3cm): ')
# '1252' is a codec alias for cp1252 (Windows Western Europe).
userFile = pd.read_csv(userCSV, encoding='1252')
userCols = userFile.columns
# Drop the index column left behind by a previous to_csv round trip, if any.
if 'Unnamed: 0' in userCols:
    userFile.drop(['Unnamed: 0'], axis=1, inplace=True)
thisIsmyDateColumn = 'Date'
#input('\nPlease specify(spell out) the name of the column that has the date criteria: ')
#input('\nPlease specify(spell out) the name of the column that has the date criteria: ')
def saveTo():
    """Prompt the user for an output folder and return it as 'name/'.

    Lists the sub-directories of the current working directory first so
    the user can pick an existing one. No validation is done on the answer.
    """
    here = os.getcwd()
    # Anything that is not a regular file is shown as a candidate folder.
    folder = [entry for entry in os.listdir(here)
              if not os.path.isfile(os.path.join(here, entry))]
    print('\nHere is a list of folders in the current directory:\n')
    print(folder)
    aLocation = input('''\n\nPlease indentify the folder where you would
like the report and other output data to be saved (folder name) : ''')
    return aLocation + '/'
# Ask where output should go; 'folder' (e.g. 'output/') is used as a path
# prefix by every save below.
folder = saveTo()
###getting rid of columns with no values:
## source: hammerdirt
def colDrop(userFile):
    """Drop all-zero object columns and empty rows from the survey data.

    Also writes an intermediate copy (with the per-day 'Total' column still
    attached) to '<folder>yourData_noZeroes_plusDaily_Total.csv'.
    Relies on the module-level globals `folder` and `thisIsmyDateColumn`.
    NOTE(review): this shadows the earlier colDrop defined in this
    notebook; unlike that one it removes 'Total' again and resets the
    index before returning, and it drops columns in place on the argument.
    """
    saveString = folder + 'yourData_noZeroes_plusDaily_Total.csv'
    lessThan = []
    #this gets rid of the empty columns
    for p in list(userFile.columns[3:]):
        b = userFile[p].sum()
        if b == 0:
            lessThan.append(p)
    userFile.drop(lessThan, axis=1, inplace=True)
    #use the total to check for empty rows
    userFile['Total'] = userFile.iloc[:, 3:].sum(axis=1)
    charts3 = userFile.copy()
    charts3.to_csv(saveString)
    #get rid of empty rows and put the file back to orignal state
    userFile = userFile[userFile.Total > 0]
    userFile.reset_index(drop=True, inplace=True)
    userFile.drop(['Total'], axis=1, inplace=True)
    #set the date column
    userFile[thisIsmyDateColumn] = pd.to_datetime(userFile[thisIsmyDateColumn])
    return userFile
userFile = colDrop(userFile)
In [45]:
# Column-name configuration; the defaults match the allData.csv file
# written earlier in this notebook.
changeColumnNames = input('''\nThe first columns of the data file should be "Location". The default
for this script is and "Location". The column names are used to set the index and define
how the data is sorted geographicaly. If you have formatted the file to the default values you can skip the next three
questions. Skip? (y or n): ''')
if changeColumnNames == 'y':
    #cityLabel = 'Commune'
    beachLabel = 'Location'
    #lengthLabel = 'length'
else:
    cityLabel = input('\nPlease verify the name of the column name that holds the city names: ')
    beachLabel = input('\nPleae verify the name of the column that holds the beach names: ')
    lengthLabel = input('\nPlease verify the name of the column that holds the beach length: ')
#thisIsMyCity = cityLabel
thisIsMyBeach = beachLabel
#thisIsMyLength = lengthLabel
# Index levels used for all the geographic slicing below.
myIndex = [thisIsMyBeach, thisIsmyDateColumn]
print(['0 - meters', '1 - feet'])
unitsLabel = input('\nIs this in meters or feet? ')
# NOTE(review): the menu shows '0'/'1' but the comparison expects the word
# 'meters'; entering 0 or 1 therefore falls through to 'foot'. Confirm the
# intended input format.
if unitsLabel == 'meters':
    units = 'meters'
else:
    units = 'foot'
print('Units will be abbreviated like this: ' + str(units))
print('\nThe MLW codes in the criteria file are being compared and matched up with the codes in your data file')
def userCriteria(csvFile):
    """Match the MLW codes present in the loaded survey data (module-level
    `userFile`) against the criteria file and return the overlap.

    Writes the matched criteria to '<folder>your_Criteria.csv'.
    Relies on the globals `userFile` and `folder`.
    """
    # s2: {G-code: [value]} built from the object columns of the data
    # (columns from position 4 onward of the first row).
    s1 = userFile.iloc[0:1, 4:]
    s2 = s1.to_dict('list')
    d1 = pd.read_csv(csvFile)
    cols = d1.columns
    if 'Unnamed: 0' in cols:
        d1.drop(['Unnamed: 0'], axis=1, inplace=True)
    #d1.set_index(['Type'], inplace=True)
    d2 = d1.to_dict('list')
    thisList = {}  #<--------- collects the matched {code: criteria-column} pairs
    # iterate through all the G codes in the user data
    for i, x in enumerate(s2.keys()):  #<------ Just looking at the dict keys
        # compare those to the G codes that are in the new criteria
        for y in (d2.keys()):
            # every G code provided by the user is compared to the G code
            # in the criteria list; on a match the criteria column is kept
            if x == y:
                q = d2[x]  #<----------- criteria values for this code
                new = {x: q}  #<------- the user's G code picks up the criteria values
                thisList.update(new)
    ds = pd.DataFrame(thisList)
    saveString = str(folder) + 'your_Criteria.csv'
    ds.to_csv(saveString)
    print('\nThe criteria has been matched to your data, the file is stored under ' + str(saveString))
    return ds
ds = userCriteria(criteria)
w = len(list(ds.columns))
print('\nThere are ' + str(w) + ' different MLW codes in your data.')
userCols = userFile.columns
def sortIt():
    """Ask the user to name the three criteria rows, pick which row to
    group by and which one holds the material, and return the two mappings
    as [sortDict, materialDict] (each {G-code: category}).

    Mutates the module-level `ds` frame: inserts a 'Code' column and sets
    it as the index.
    """
    print('''\nThe contents of the criteria file need to be named.
Please provide a one word description for the contents of each row.
The minimum size criteria file is three rows plus the column headers
-one row for your specific criteria
-one row for a description of the MLW code
-one row for the material\n''')
    print('''If the order and names of the rows in your criteria file are:
\nRow one : Material
\nRow two : Description
\nRow three : Source
\nOr if you are using the criteria file provided you can skip the next three questions''')
    skip = input('Skip the next three questions?(y/n) ')
    if skip == 'y':
        # Default row names for the bundled criteria file.
        firstRow = 'Material'
        secondRow = 'Description'
        thirdRow = 'Source'
        ds.insert(0, 'Code', [firstRow, secondRow, thirdRow])
        ds.set_index('Code', inplace=True, drop=True)
    else:
        firstRow = input('Please provide a one word desription for the contents of the first row: \n')
        secondRow = input('\nTitle for the second row: ')
        thirdRow = input('\nTitle for the third row: ')
        ds.insert(0, 'Code', [firstRow, secondRow, thirdRow])
        ds.set_index('Code', inplace=True, drop=True)
    mySort = input('\nWhich one would you like to use to group your data? ')
    myMat = input('\nWhich one is the material type? ')
    # Convert the chosen rows into {G-code: category} dictionaries for
    # the axis-1 groupbys below.
    dSortx = ds.loc[mySort]
    dMatx = ds.loc[myMat]
    dsort = dSortx.to_dict()
    dmat = dMatx.to_dict()
    return [dsort, dmat]
dsort, dmat = sortIt()
userFile.set_index(myIndex, inplace=True)
print('''At the end of the this script you can do other analysis on the formatted data
by calling 'userFile', it is a pandas DataFrame with the index set to City, Location, Date''')
# use the dict to create a groupby instance
# on the original data frame
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas.
source = userFile.groupby(dsort, axis=1)
mat = userFile.groupby(dmat, axis=1)
# here is the data that will be charted
oneMore = pd.DataFrame(source.sum())  # totals per user-chosen category
oneMat = pd.DataFrame(mat.sum())      # totals per material
oneMat['Total'] = oneMat.iloc[:, 1:].sum(axis=1)
oneMore['Total'] = oneMore.iloc[:, 1:].sum(axis=1)
userFile['Total'] = userFile.iloc[:, 1:].sum(axis=1)
oneMore.to_csv('your_Data_sorted.csv')
print('\nYour data has been sorted and is ready to be presented based on your criteria\n')
print('\nThere is a copy stored locally under "your_Data_sorted.csv"\n')
x1 = userFile.index.values  # array of (Location, Date) tuples
def createChoices(indexValues):
    """Split an iterable of (level-one, level-two) index tuples into two
    lists of unique values, preserving first-seen order.

    Returns [uniqueLevelOne, uniqueLevelTwo].
    """
    cList = []
    lList = []
    #iterate through the list of tuples
    for c, l in indexValues:
        #use the position to seperate
        cList.append(c)
        lList.append(l)
    # dict.fromkeys() deduplicates in O(n) while preserving insertion
    # order, replacing the original O(n^2) list-scan comprehension.
    levelOne = list(dict.fromkeys(cList))
    levelTwo = list(dict.fromkeys(lList))
    return [levelOne, levelTwo]
# Unique locations (level 0) and dates (level 1) from the index tuples.
theChoices = createChoices(x1)
cityChoices = theChoices[0]
beachChoices = theChoices[1]
# Sorted indexes are required for .loc / IndexSlice slicing below.
userFile.sort_index(inplace=True)
oneMore.sort_index(inplace=True)
oneMat.sort_index(inplace=True)
idx = pd.IndexSlice
if theTask == 1:
    # City report: pick one level-0 value and slice on it.
    yourChoice = pd.Series(cityChoices)
    print('\nThe report will be based on one of the items on this list:')
    for i, x in enumerate(yourChoice):
        print(str(i) + ' - ' + str(x), sep=', ')
    userChoice = int(input('\nenter the index number of the city of your choice : '))
    myCityName = input('\nCan you confirm the name of your choice?\nThis will be included in report and chart titles: ')
    x2 = cityChoices[userChoice]
    analyzeAll = userFile.copy()
    analyzeThis = userFile.loc[x2].copy()
    analyzeSource = oneMore.loc[x2].copy()
    analyzeMat = oneMat.loc[x2].copy()
elif theTask == 2:
    # Beach report: slice on the second index level with IndexSlice.
    yourChoice = pd.Series(beachChoices)
    print('\nThe report will be based on one of the items on this list:')
    for i, x in enumerate(yourChoice):
        print(str(i) + ' - ' + str(x), sep=', ')
    userChoice = int(input('\nenter the index number of your choice : '))
    myCityName = input('\nCan you confirm the name of your choice?\nThis will be included in report and chart titles: ')
    x2 = beachChoices[userChoice]
    analyzeAll = userFile.copy()
    analyzeThis = userFile.loc[idx[:, [x2]], :].copy()
    analyzeSource = oneMore.loc[idx[:, [x2]], :].copy()
    # NOTE(review): 'analyzeMAt' (capital A) does not match 'analyzeMat'
    # assigned in the theTask == 1 branch — probable typo; confirm which
    # name downstream cells expect.
    analyzeMAt = oneMat.loc[idx[:, [x2]], :].copy()
elif theTask == 0:
    myReportName = input('''You have chosen a summary report for all the data.
Can you give a name for the report, it will be used in chart titles: ''')
    analyzeAll = userFile.copy()
    analyzeSource = oneMore.copy()
    # NOTE(review): same 'analyzeMAt' typo as above; this branch also never
    # defines analyzeThis / x2 / myCityName, which later cells reference —
    # a task-0 run would raise NameError there.
    analyzeMAt = oneMat.copy()
print('''\nThis module outputs several DataFrames with the same index:
- userFile (data cleaned up)
- oneMore(data sorted by user criteria)
- oneMat(data sorted by material)''')
####This gets the topTen values from the data
## Prepares the data to be used in ax1
# outPut is a DataFrame with 12 columns including a total column
# the data is selected by the location specified by the user
In [46]:
def topTen(file):
    """Return a frame with the ten highest-count object columns, an
    'All other objects' column summing the rest, and the 'Total' column.

    Assumes the first column is metadata (length) and the last is 'Total';
    both are excluded from the ranking.
    """
    cols = list(file.columns)
    # Rank the object columns (everything between first and last) by
    # their grand total, largest first.
    totals = Series(file[cols[1:-1]].sum(), name='Lake').sort_values(ascending=False)
    topCodes = totals.index.values[:10]
    restCodes = totals.index.values[10:]
    ranked = pd.DataFrame(file[topCodes])
    others = Series(file[restCodes].sum(axis=1), name='All other objects')
    ranked = pd.concat([ranked, others], axis=1)
    return pd.concat([ranked, pd.DataFrame(file['Total'])], axis=1)
####this section turns top ten items(plus the other objects)
####into a percentage of the whole. by dividing by the Total column
## the data is used for ax1
# outPut is a DataFrame with 1 row and 11 columns
# row name is the location
def the2ColumnDivider(thedataframe, theDenominator):
    """Express every column except the last as a percentage of the
    `theDenominator` column.

    Returns a new DataFrame with the same column order; the last column
    (assumed to be the denominator/total column) is omitted.
    """
    numerators = thedataframe[list(thedataframe.columns[:-1])]
    # Row-wise divide then scale to percent; replaces the original
    # column-by-column concat loop and its unused locals.
    return numerators.div(thedataframe[theDenominator], axis=0) * 100
def theDenseDivider(theDataFrame, theDenominator, theSorter, percent):
    """Divide every column after the first by the `theDenominator` column,
    then group the resulting columns with the `theSorter` mapping
    (MLW code -> category) and sum each group.

    percent: 'y' scales the ratios to percentages; anything else leaves
    them as densities (pieces per unit length). Columns missing from the
    mapping are dropped, as with the original dict groupby.
    """
    numerators = theDataFrame[list(theDataFrame.columns[1:])]
    ratios = numerators.div(theDataFrame[theDenominator], axis=0)
    if percent == 'y':
        ratios = ratios * 100
    # Transpose-groupby-transpose instead of groupby(..., axis=1): the
    # axis=1 form is deprecated (and removed in recent pandas); this is
    # the equivalent supported spelling.
    return ratios.T.groupby(theSorter).sum().T
def theLocationBox(dataToBeCharted, string):
    """Build a display string: the `string` header followed by the unique
    level-0 index values (location names), one per line.

    Fixes two defects in the original: it referenced the global `stringx`
    instead of its `string` parameter (NameError once the callers that set
    stringx were commented out), and it appended a partial copy of the
    text on every loop pass, duplicating the list in the output.
    """
    # Unique locations in first-seen order.
    uniquePlaces = []
    for spot in dataToBeCharted.index.get_level_values(0):
        if spot not in uniquePlaces:
            uniquePlaces.append(spot)
    text = string + ':\n\n '
    for place in uniquePlaces:
        text = text + ' ' + place + '\n '
    return text + '\n'
#if theTask == 1 or 2:
#if theTask == 1:
#stringx = 'The locations in '
#myLocations = theLocationBox(densitySource, stringx) #<------ ax4 inPut
#else:
#stringx = 'This site is located in '
#myLocations = theLocationBox(densitySource, stringx) #<------ ax4 inPut
def stringMaker(columnSource, sourceColumns):
    """For every category column in `columnSource`, list the MLW codes that
    were grouped into it (looked up via the `sourceColumns` groupby),
    wrapping to a new line every four codes.

    Fixes a defect in the original: the first code of each group was
    silently skipped because index 0 matched neither formatting branch
    (0 % 4 == 0 but the 'and i != 0' guard excluded it).
    """
    wordsList = []
    for name in columnSource.columns:
        gun = list(sourceColumns.get_group(name).columns)
        thisString = 'The MlW codes included in ' + name + ':\n'
        for j, code in enumerate(gun):
            if j == len(gun) - 1:
                thisString = thisString + code + '\n'   # last code in group
            elif (j + 1) % 4 == 0:
                thisString = thisString + code + '\n'   # wrap after every 4th
            else:
                thisString = thisString + code + ', '
        wordsList.append(thisString)
    anotherString = ''
    for words in wordsList:
        anotherString = anotherString + '\n' + words
    return anotherString
def topTenLength(file):
    """Like topTen(), but append the survey 'length' column instead of
    'Total', so downstream density (pieces per length) math can use it.

    Fixes a defect: the original read length from the global `userFile`
    rather than from the `file` argument, silently mixing frames whenever
    the caller passed anything other than userFile itself.
    """
    cols = list(file.columns)
    totals = Series(file[cols[1:-1]].sum(), name='Lake').sort_values(ascending=False)
    topCodes = totals.index.values[:10]
    restCodes = totals.index.values[10:]
    ranked = pd.DataFrame(file[topCodes])
    others = Series(file[restCodes].sum(axis=1), name='Other')
    ranked = pd.concat([ranked, others], axis=1)
    # length of each survey, taken from the argument (not the global).
    lengths = pd.DataFrame(file['length'])
    return pd.concat([ranked, lengths], axis=1)
### this function divides the columns by either the total or the length
## you can select either percent 'y' (divide by total)
## or 'n' (divided by length)
# outPut is a dataFrame
def thetopTenDivider(theDataFrame, theDenominator, percent):
    """Divide every column except the last by the `theDenominator` column.

    percent: 'y' -> percentage of the denominator (e.g. of 'Total');
    anything else -> plain ratio (e.g. density when dividing by 'length').
    """
    numerators = theDataFrame[list(theDataFrame.columns[:-1])]
    # Row-wise vectorized divide; replaces the original per-column concat
    # loop and its unused locals.
    ratios = numerators.div(theDataFrame[theDenominator], axis=0)
    if percent == 'y':
        ratios = ratios * 100
    return ratios
def otherTopTen(theReferenceData, theData):
    """Pull, from `theData`, the value for every code in
    `theReferenceData`'s index except the last entry (the 'Other' bucket),
    and return them as the columns of a one-row DataFrame.
    """
    codes = list(theReferenceData.index[:-1])
    if not codes:
        # No codes to pull: same empty frame the original loop produced.
        return pd.DataFrame()
    pulled = [Series(theData.loc[code], name=code) for code in codes]
    return pd.concat(pulled, axis=1)
def dropColAndSum(dataToDropFrom, otherTopTenColsToDrop, lengthOrTotalcolumn, nameOtherObjects):
    """Sum everything in `dataToDropFrom` that is not already shown in the
    top-ten frame and append it as a `nameOtherObjects` column.

    Fixes a defect: the original concatenated the global `thisUnDf`
    instead of its `otherTopTenColsToDrop` parameter (they happened to be
    the same object at the call site, which masked the bug).
    """
    # Remove the codes already charted, then the length/Total entry, and
    # collapse the remainder into a single 'other objects' value.
    remainder = dataToDropFrom.drop(list(otherTopTenColsToDrop.columns))
    otherTotal = Series(remainder.drop(lengthOrTotalcolumn).sum(), name=nameOtherObjects)
    return pd.concat((otherTopTenColsToDrop, otherTotal), axis=1)
# NOTE(review): 'theTask == 1 or 2' is always True — it parses as
# (theTask == 1) or (2), and the literal 2 is truthy. The intended test is
# probably 'theTask in (1, 2)'. The same pattern repeats below.
if theTask == 1 or 2:
    allChart = pd.DataFrame(Series(topTen(analyzeAll).sum(), name='All Samples'))
    localChartX2 = pd.DataFrame(Series(topTen(analyzeThis).sum(), name=x2))
    topTenSummary = localChartX2.transpose()
    allChartSummary = allChart.transpose()
####this section turns top ten items(plus the other objects)
####into a percentage of the whole. by dividing by the Total column
## the data is used for ax1
# outPut is a DataFrame with 1 row and 11 columns
# row name is the location
###this is the output for ax1 of the city report
if theTask == 1 or 2:
    x2TopTenSummary = the2ColumnDivider(topTenSummary, 'Total')  #<------ax1 inPut
    ####this is the topten of all the data provided
    allLocationsTopTenSummary = the2ColumnDivider(allChartSummary, 'Total')
####this section gives you the density or % values of the user criteria
## the Sorter is generated when the user chooses what category to sort by
## the material is sorted as default
## this gives output for the timeSeries charts
## and the average density of the users criteria and the material %
# outPut is DataFrame grouped by the user criteria
### This is the output for the timeSeries charts
### the output is for figure 2 ax1
if theTask == 1 or 2:
    allDensities = theDenseDivider(analyzeAll, 'length', dsort, 'n')
    densitySource = theDenseDivider(analyzeThis, 'length', dsort, 'n')
    densityMat = theDenseDivider(analyzeThis, 'Total', dmat, 'y')
    ###this is the outPut for the summary density chart (user criteria)
    ## and the summary material percentage chart
    matDensitySummaryLocation = pd.DataFrame(Series(densityMat.mean(), name=x2)).transpose()  #<------ax2 inPut
    sourceDensitySummaryLocation = pd.DataFrame(Series(densitySource.mean(), name=x2)).transpose()  #<-----ax3 inPut
    allDensitiesSource = pd.DataFrame(Series(allDensities.mean(), name='All Locations')).transpose()
####This section creates the string that displays
## the beach locations in the data acording to the users city selection
# outPut is a string based on the unique values of the index
###this section gets the info for the summary data
# outPut is string for ax5
if theTask == 1 or 2:  # NOTE(review): always True — see note above
    nSamples = len(list(analyzeSource.index.get_level_values(0)))  #<----- number of samples
    averageDensity = sourceDensitySummaryLocation.sum().sum().round(2)  #<------ averaged density
    totalNumber = analyzeThis['Total'].sum()  #------------ total number of pieces
    if theTask == 2:
        # NOTE(review): level 2 will not exist on a two-level
        # (Location, Date) index — confirm the index shape expected here.
        dates = densitySource.index.get_level_values(2)  #<------- getting the dates from the index
    if theTask == 1:
        dates = densitySource.index.get_level_values(0)
    # NOTE(review): the conversion result is discarded; 'dates' is NOT
    # converted in place — probably meant dates = pd.to_datetime(dates).
    pd.to_datetime(dates)
    theEarliest = min(dates).date()  #<--------- the earliest date
    theLatest = max(dates).date()  #<------------- the latest date
#else:
# nSamples = len(list(densitySource.index.get_level_values(0)))#<----- number of samples
# averageDensity = densitySource.mean().sum().round(2)#<------ averaged density
# totalNumber = localChartX2.loc['Total'].item()#------------ total number of pieces
# dates = densitySource.index.get_level_values(1)#<------- getting the dates from the index
# theEarliest = min(dates).date()#<--------- the earliest date
# theLatest = max(dates).date()#<---------------- the latest date
###<-------------- ax5 input: summaryData
# NOTE(review): myCityName is only defined for theTask 1 or 2 — a task-0
# run raises NameError here.
summaryData = 'Litter density : ' + myCityName
summaryData2 = 'Total number of pieces: ' + str(totalNumber) + '\nAverage density: ' + str(averageDensity) + ' pcs/m' + '\nFirst sample: ' + str(theEarliest) + '\nLast sample : ' + str(theLatest) + '\nNumber of samples: ' + str(nSamples) + '\n'
####this section prints out the items in the categories
## The Categories and the MLW codes are determined by the userCriteria file
# outPut is a string
# NOTE(review): 'task' here is the menu *list*, so 'task == 1 or 2' is
# always True via the truthy literal 2; probably meant 'theTask in (1, 2)'.
if task == 1 or 2:
    mlwCodeInventory = stringMaker(densitySource, source)  #<----- ax6 data
###This section gets the data to make the topten comparison chart
## as opposed to the individual chart this value is given as a density
## The topten of all the data is calculated
## then the Gcodes from that are pulled from the local data
# outPut is two dataFrames with one row and 11columns
### this function gets the top ten density values
## input is allthe data
# outPut is a dataFrame with 11 columns and one row
if task == 1 or 2:  # NOTE(review): always True ('task' is a list) — see note above
    # Mean density per object code: the user's selection vs all data.
    thisUn = pd.DataFrame(thetopTenDivider(analyzeThis, 'length', 'n')).mean()
    thatUn = pd.DataFrame(thetopTenDivider(topTenLength(analyzeAll), 'length', 'n')).mean()
    ### this function takes the index from 'thatUn'
    ## and pulls the corresponding value from 'thisUn'
    # outPut is a dataFrame of the users selection
    thisUnDf = otherTopTen(thatUn, thisUn)
    ### This function caluclates the value of 'Other objects' for 'thisUnDf'
    ## it drops the columns from the previous function
    ## sums up whats left and creates a series 'Other objects"
    ## that series gets appended to thisUnDf
    # outPut is a df that will be charted
    dropMe = dropColAndSum(thisUn, thisUnDf, 'length', thatUn.index[-1])
    #converting to a dataFrame and transposing
    thatUn = pd.DataFrame(thatUn)
    thatUnOne = thatUn.transpose()
    valsLocal = dropMe.iloc[0].values
    valsGlobal = thatUnOne.iloc[0].values
# Fit a normal distribution to the log of litter density for ALL data
# (the 'field') ...
theField = analyzeAll.copy()
theField = theField[theField.Total.notnull()]
theField = theField[theField.length < 375]  # drop the extreme-length surveys
theField['Dense'] = theField['Total'] / theField['length']
theField['Logs'] = theField['Dense'].apply(np.log)
theField = theField[theField.Logs > -4.6]  # ln(1/100): drop near-zero densities
muField, sigmaField = stats.norm.fit(theField['Logs'])
# ... and the same for the user's selection, plus its z-score vs the field.
th = analyzeThis.copy()
th = th[th.Total.notnull()]
th = th[th.length < 375]  #<---------- the maximum length values were removed
th['Dense'] = th['Total'] / th['length']
th['Logs'] = th['Dense'].apply(np.log)
th = th[th.Logs > -4.6]  #<----------- this is equal to 1/100
mu, sigma = stats.norm.fit(th['Logs'])
theZscore = (th['Logs'].mean() - muField) / sigmaField
points = len(theField['Logs'])
In [47]:
# ---- Figure 1: one-page PDF report for the selected city/beach ----
fig = plt.figure(figsize=(8.5, 11), frameon=False, edgecolor='000000', linewidth=1)
# Axes rectangles: (left, bottom, width, height) in figure fractions.
rect0 = .65, .71, .1, .24
rect1 = .65, .36, .1, .24
rect2 = .13, .36, .1, .24
rect3 = .23, .7, .06, .06
rect4 = .08, .9, .4, .05
rect5 = .08, .82, .4, .09
rect6 = .13, .07, .4, .195
rect7 = .6, .12, .3, .09
rect8 = .62, .16, .2, .1
rect9 = .08, .66, .3, .02
ax1 = fig.add_axes(rect0)  #<-----------x2TopTenSummary
ax2 = fig.add_axes(rect1)  #<-----------matDensitySummaryLocation
ax3 = fig.add_axes(rect2)  #<-----------sourceDensitySummaryLocation
ax4 = fig.add_axes(rect3)  #<-----------performance
ax5 = fig.add_axes(rect4)  #<-----------summaryDataTitle
ax8 = fig.add_axes(rect7)  #<-----------summaryData
ax6 = fig.add_axes(rect5)  #<-----------summaryData2
ax7 = fig.add_axes(rect6)  #<----------- comparison chart
ax9 = fig.add_axes(rect8)  #<----------- image
# NOTE(review): this shadows matplotlib.colors, imported as 'colors' above.
colors = ['#003333','#339966', '#66CCCC', '#3399CC', '#993333', '#CC6600', '#FF0033', '#FFCC00', '#3366CC', '#66CC00', '#333300', '#FF0000']
anuther = [ax1, ax2, ax3]
# One-row summaries, sorted largest-first, drawn as stacked bars.
one = x2TopTenSummary.iloc[0].sort_values(ascending=False)
two = matDensitySummaryLocation.iloc[0].sort_values(ascending=False)
three = sourceDensitySummaryLocation.iloc[0].sort_values(ascending=False)
data = [one, two, three]
for t, r in enumerate(data):
    blocks = 0.1
    width = 0.6
    bottom = 0
    z = r.index
    q = anuther[t]
    lables = list(z)  # NOTE(review): unused (typo of 'labels'?)
    # Stack one bar segment per category, tallest at the bottom.
    for i, u in enumerate(z):
        color = colors[i]
        q.bar(blocks, r[u], width=width, bottom=bottom, color=color, label=z[i])
        bottom += r[u]
    # Reverse the legend so it reads top-of-stack first.
    handles, labels = q.get_legend_handles_labels()
    q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1), borderaxespad=0., fancybox=False, frameon=False)
    if t == 0:
        title = 'Top-ten MLW codes'
        ylabel = 'Topten objects as a % of total'
        q.set_ylim(top=102)
    elif t == 1:
        title = 'Materials'
        ylabel = 'Material as a % of total'
        q.set_ylim(top=102)
    elif t == 2:
        title = 'Density'
        ylabel = 'Pieces of trash per ' + unitsLabel
    q.set_title(title, size=14, loc='left')
    q.set_ylabel(ylabel, size=12)
    # NOTE(review): string 'off' for tick_params booleans is deprecated;
    # recent matplotlib expects False.
    q.tick_params(axis='x', labelbottom='off', bottom='off', top='off')
    nSamples = len(analyzeAll.loc[x2]['Total'])
    if nSamples > 1:
        noSample = 'samples'
    else:
        noSample = 'sample'
    q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
    q.xaxis.labelpad = 20
# Color the performance dot by the percentile of this site's mean
# log-density within the field distribution.
percen = stats.percentileofscore(theField['Logs'], th['Logs'].mean()).round(0)
if percen <= 16:
    color = 'g'
elif 17 <= percen <= 49:
    color = 'y'
elif 50 <= percen < 83:
    color = '#FF4500'
elif 84 <= percen:
    color = 'r'
zText = 'Density is greater than '
theRest = '%\n of all other locations'
ax4.scatter(1, 1, s=500, color=color)
ax4.set_title('Overall performance')
ax4.set_xlabel(zText + str(percen) + theRest)
ax4.tick_params(axis='both', labelbottom='off', bottom='off', top='off', left='off')
ax4.set_yticklabels([])
ax4.set_xticklabels([])
ax5.text(0, 1, summaryData, size=16, wrap=True, verticalalignment='top')
ax6.text(0, 1, summaryData2, size=14, verticalalignment='top')
ax5.set_axis_off()
ax6.set_axis_off()
theLogs = theField['Logs']
theZscore = (theField['Logs'].mean() - muField) / sigmaField  # NOTE(review): always 0 by construction
sixteenth = muField + -1 * sigmaField
eightyFourth = muField + 1 * sigmaField
ratio = muField / sigmaField
zLessThanNegOne = sixteenth
zLessThanZero = muField
zLessThanOne = eightyFourth
def logNormalCurve(theData, numberOfpoints, loc=None, scale=None):
    """Evaluate a normal pdf over the observed range of `theData`.

    loc/scale default to the module-level field fit (muField, sigmaField),
    so existing two-argument calls behave exactly as before; passing them
    explicitly removes the hidden global dependency.

    Returns [x, pdf, xmin, xmax].
    """
    if loc is None:
        loc = muField
    if scale is None:
        scale = sigmaField
    xmin = min(theData)
    xmax = max(theData)
    x = np.linspace(xmin, xmax, numberOfpoints)
    pdf = stats.norm.pdf(x, loc=loc, scale=scale)
    return [x, pdf, xmin, xmax]
# NOTE(review): calling logNormalCurve four times repeats the same
# computation; one call unpacked into x, pdf, xmin, xmax would do.
x = logNormalCurve(theLogs, points)[0]
pdf = logNormalCurve(theLogs, points)[1]
xmin = logNormalCurve(theLogs, points)[2]
xmax = logNormalCurve(theLogs, points)[3]
# Mark this location's mean log-density on the field curve and shade the
# area below it.
px = th['Logs'].mean()
py = stats.norm.pdf(px, loc=muField, scale=sigmaField)
ax7.plot(x, pdf, 'k', linestyle='dashed', color='b', alpha=1)
ax7.fill_between(x, stats.norm.pdf(x, loc=muField, scale=sigmaField), where=(x <= px), color='g', alpha=0.3)
ax7.vlines(px, 0, stats.norm.pdf(muField, loc=muField, scale=sigmaField), color='b', linestyle='-.')
ax7.scatter(px, py, s=200, color='r')
ax7.spines['bottom'].set_position('zero')
ax7.set_title('The distibtuion of all values and this location')
ax7.set_xlabel('The ln of litter density', size=14)
today = date.today()
closingText = 'This report printed on ' + str(today) + '\nCreated using python 3.62\n Math, trash and water\n www.hammerdirt.ch'
ax8.text(0, 0, closingText, size=12, wrap=True, verticalalignment='top')
ax8.set_axis_off()
im = image.imread(imageFile)
ax9.imshow(im, aspect='auto', zorder=-1)
ax9.set_axis_off()
# Save a PNG copy and the one-page PDF report.
from matplotlib.backends.backend_pdf import PdfPages
plt.savefig('thisImage.png', bbox_inches='tight')
saveDocString = str(folder) + str(x2) + '_litter_Density.pdf'
pp = PdfPages(saveDocString)
fig.savefig(pp, format='pdf')
pp.close()
plt.show()
plt.close()
In [25]:
# ---- Figure 2: summary page for all of the data ----
fig = plt.figure(figsize=(8.5, 11), frameon=False, edgecolor='000000', linewidth=1)
thisX = pd.DataFrame(source.sum())
otaysiix = stringMaker(thisX, source)  # category -> MLW-code inventory text
idx = pd.IndexSlice
totals = analyzeAll['Total']
total = totals.sum().round(0)
theDates = analyzeAll.index.get_level_values(1)
# NOTE(review): this converts 'dates' (left over from an earlier cell),
# not 'theDates', and discards the result — probably meant
# theDates = pd.to_datetime(theDates).
pd.to_datetime(dates)
firstDate = min(theDates).date()
lastDate = max(theDates).date()
numberSamples = len(totals)
# Numbered list of locations for the summary text block.
yourChoice = pd.Series(cityChoices)
thisString = ''
for i, x in enumerate(yourChoice):
    pos = str(i)
    city = x  # NOTE(review): unused
    string = pos + ' - ' + x + '\n'
    thisString = thisString + string
#print(thisString)
where = '\nIncludes data from the following locations:\n' + '\n' + thisString
howManyTimes = '\nTotal number of samples: ' + str(numberSamples)
howMany = '\nTotal number of pieces: ' + str(total)
avDensity = '\nAverage density: ' + str(allDensitiesSource.mean().sum().round(1))
first = '\nFirst sample: ' + str(firstDate)
last = '\nLast sample: ' + str(lastDate)
# Axes rectangles: (left, bottom, width, height) in figure fractions.
rect8 = .62, .16, .2, .1
rect9 = .08, .66, .3, .02
rect0 = 0.05, .7, .45, .25
rect1 = .3, .7, .18, .3
rect2 = .64, .2, .25, .75
rect3 = 0, 0, .7, .34
rect4 = 0.18, 0.4, .1, .22
rect5 = 0.17, 0.09, .1, .22
rect6 = .6, .12, .3, .09
rect7 = .62, .14, .2, .08
#gridSize = gridspec.GridSpec(24, 8)
ax1 = fig.add_axes(rect0)  #, rowspan=10, colspan=2)
#ax2 = fig.add_axes(rect1)#, rowspan=10, colspan=2)
ax3 = fig.add_axes(rect2)
#ax5 = fig.add_subplot(gridSize[1:2, 7:])
#ax4 = fig.add_axes(rect3)
ax5 = fig.add_axes(rect4)
ax6 = fig.add_axes(rect5)
ax7 = fig.add_axes(rect6)
#ax8 = fig.add_axes(rect7)
ax1.text(0, 1, where + howManyTimes + howMany + avDensity + first + last, verticalalignment='top', size=12)
ax1.set_title('Summary information for all data', loc='left', size=14)
#ax2.text(0,1, 'ax2')
ax3.text(0, 1, otaysiix, size=10, wrap=True, verticalalignment='top')
ax3.set_title('Items by category', loc='right', size=14)
# NOTE(review): ax4 is commented out above — this reuses the ax4 left over
# from the previous figure's cell (or raises NameError on a fresh run).
ax4.text(0, 1, 'ax4')
anuther = [ax5, ax6]
one = x2TopTenSummary.iloc[0].sort_values(ascending=False)
two = matDensitySummaryLocation.iloc[0].sort_values(ascending=False)
three = sourceDensitySummaryLocation.iloc[0].sort_values(ascending=False)
oneAll = allLocationsTopTenSummary.iloc[0].sort_values(ascending=False)
twoAll = allDensitiesSource.iloc[0].sort_values(ascending=False)
#threeAll=
data = [oneAll, twoAll]
nSamples = len(analyzeAll['Total'])
for t, r in enumerate(data):
    blocks = 0.1
    width = 0.6
    bottom = 0
    z = r.index
    q = anuther[t]
    lables = list(z)  # NOTE(review): unused (typo of 'labels'?)
    for i, u in enumerate(z):
        color = colors[i]
        q.bar(blocks, r[u], width=width, bottom=bottom, color=color, label=z[i])
        bottom += r[u]
    handles, labels = q.get_legend_handles_labels()
    q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1), borderaxespad=0., fancybox=False, frameon=False)
    if t == 0:
        title = 'Top-ten MLW codes'
        ylabel = 'Topten objects as a % of total'
    elif t == 1:
        title = 'Density'
        ylabel = 'Pieces of trash per ' + unitsLabel
    elif t == 2:
        title = 'Density'
        ylabel = 'Pieces of trash per ' + unitsLabel
    q.set_title(title, size=14, loc='left')
    q.set_ylabel(ylabel, size=12)
    q.tick_params(axis='x', labelbottom='off', bottom='off', top='off')
    if nSamples > 1:
        noSample = 'samples'
    else:
        noSample = 'sample'
    q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
    q.xaxis.labelpad = 20
today = date.today()
closingText = 'This report printed on ' + str(today) + '\nCreated using python 3.62\n Math, trash and water\n www.hammerdirt.ch'
ax7.text(0, 0, closingText, size=12, wrap=True, verticalalignment='top')
ax7.set_axis_off()
#im = image.imread(imageFile)
#ax8.imshow(im, aspect='auto', zorder=-1)
#ax8.set_axis_off()
ax3.set_axis_off()
ax4.set_axis_off()
ax1.set_axis_off()
# PdfPages was imported in the previous figure's cell.
saveDocString = str(folder) + 'summary.pdf'
pp = PdfPages(saveDocString)
fig.savefig(pp, format='pdf')
pp.close()
plt.show()
plt.close()
In [163]:
gh = cityChoices
#densitySource = theDenseDivider(analyzeThis, 'length', dsort, 'n')#<-----ax3 inPut
#pd.DataFrame(Series(densitySource.mean(), name=x2)).transpose()#<-----ax3 inPut
# Mean density by category for a single location (exploratory cell).
antelopeDensity = theDenseDivider(analyzeAll.loc[gh[2]], 'length', dsort, 'n')
antelopeDensity = pd.DataFrame(Series(antelopeDensity.mean(), name=gh[2]))
# One mean-density frame per location.
theDataList = []
for i in cityChoices:
    theDensity = theDenseDivider(analyzeAll.loc[i], 'length', dsort, 'n')
    theDensity = pd.DataFrame(Series(theDensity.mean().sort_values(ascending=False), name=i))
    theDataList.append(theDensity)
In [178]:
#pd.DataFrame(Series(topTen(analyzeThis).sum(), name=x2))
#topTenSummary = localChartX2.transpose()
#x2TopTenSummary = the2ColumnDivider(topTenSummary, 'Total')
# Per-location top-ten tables as a % of each location's total; 'Total'
# survives the divider (sorting put it first, not last) so drop it here.
theTopTenList = []
for i in cityChoices:
    theTop = pd.DataFrame(Series(topTen(analyzeAll.loc[i]).sum().sort_values(ascending=False), name=i))
    theTop = theTop.transpose()
    theTop = the2ColumnDivider(theTop, 'Total')
    theTop.drop(['Total'], axis=1, inplace=True)
    theTopTenList.append(theTop)
theTopTenList[0].index.values[0]
Out[178]:
In [32]:
from matplotlib.gridspec import GridSpec
# Rebuild the two chart inputs for the at-a-glance figure: per-location
# mean densities (with the grand total of each, used for a shared y-limit)
# and per-location top-ten shares.
theDensityList = []
theHighAndLow = []
for location in cityChoices:
    means = theDenseDivider(analyzeAll.loc[location], 'length', dsort, 'n').mean()
    meanRow = pd.DataFrame(Series(means, name=location)).transpose()
    theHighAndLow.append(meanRow.sum().sum())
    theDensityList.append(meanRow)
# Inspect the largest per-location total (cell-output expression).
max(theHighAndLow)
theTopTenList = []
for location in cityChoices:
    counts = Series(topTen(analyzeAll.loc[location]).sum().sort_values(ascending=False), name=location)
    asRow = pd.DataFrame(counts).transpose()
    shares = the2ColumnDivider(asRow, 'Total')
    shares.drop(['Total'], axis=1, inplace=True)
    theTopTenList.append(shares)
# Color cycle for the stacked bars (one color per category).
colors = ['#003333','#339966', '#66CCCC', '#3399CC', '#993333', '#CC6600', '#FF0033', '#FFCC00', '#3366CC', '#66CC00', '#333300', '#FF0000']
# Letter-size page: two rows of four small panels on top, two wide
# panels (distribution + text) on the bottom.
Fig = plt.figure(figsize=(8.5, 11))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                    wspace=.9, hspace=.5)
gs = GridSpec(3, 5)
ax1, ax2, ax3, ax4 = (plt.subplot(gs[0, col]) for col in range(4))
ax5, ax6, ax7, ax8 = (plt.subplot(gs[1, col]) for col in range(4))
ax9 = plt.subplot(gs[2:, 0:3])
ax10 = plt.subplot(gs[2:, 3:])
topTenRow = [ax1, ax2, ax3, ax4]
densityRow = [ax5, ax6, ax7, ax8]
def charter(data, axes, listName):
    """Draw one stacked single-bar chart per location.

    Parameters
    ----------
    data : list of single-row DataFrames, one per location; the row label
        is the location name and each column holds one category's value.
    axes : list of matplotlib axes, parallel to ``data``.
    listName : 'topTenList' or 'densityList'; selects the y-label on the
        leftmost panel and, for the density row, a shared y-limit so the
        panels are directly comparable.

    Reads the module-level ``colors``, ``analyzeAll`` and ``theHighAndLow``.
    """
    for t, r in enumerate(data):
        q = axes[t]
        blocks = 0.1   # x position of the single stacked bar
        width = 0.6
        bottom = 0
        z = r.columns
        for i, u in enumerate(z):
            q.bar(blocks, r[u], width=width, bottom=bottom, color=colors[i], label=u)
            bottom += r[u]
        title = r.index.values[0]   # the frame's single row label = location name
        q.set_title(title, size=11, loc='left')
        # Booleans (not the deprecated 'off' strings, removed in modern
        # matplotlib) hide the x ticks and their labels.
        q.tick_params(axis='x', labelbottom=False, bottom=False, top=False)
        q.xaxis.labelpad = 10
        # Report the number of surveys behind this panel.
        nSamples = len(analyzeAll.loc[title]['Total'])
        noSample = 'samples' if nSamples > 1 else 'sample'
        q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
        if t == 0:
            # Only the leftmost panel of each row carries the y-label.
            if listName == 'topTenList':
                q.set_ylabel('Top-ten items as % of total', size=12)
            elif listName == 'densityList':
                q.set_ylabel('Pieces of trash per meter', size=12)
        if listName == 'densityList':
            # Shared scale across all density panels.  'top=' replaces the
            # deprecated 'ymax=' keyword; this also collapses the duplicate
            # set_ylim the original issued when t == 0.
            q.set_ylim(top=max(theHighAndLow))
        handles, labels = q.get_legend_handles_labels()
        q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.4, 1), borderaxespad=0.2, fancybox=False, frameon=False)
# Draw the two chart rows: top-ten shares (ax1-ax4) and densities (ax5-ax8).
charter(theTopTenList, topTenRow, 'topTenList')
charter(theDensityList, densityRow, 'densityList' )
plt.suptitle("Top ten items as % of total, density by category and distribution of results", size=14)
# --- Prepare per-survey log densities for the distribution panel ----------
theField = analyzeAll.copy()
theField = theField[theField.Total.notnull()]
# NOTE(review): 375 looks like an upper bound on plausible survey length --
# confirm the unit and the cutoff.
theField = theField[theField.length < 375]
# Pieces of trash per unit length, then its natural log.
theField['Dense'] = theField['Total']/theField['length']
theField['Logs'] = theField['Dense'].apply(np.log)
# Drop extreme low-density outliers (ln density <= -4.6, i.e. ~0.01/unit).
theField = theField[theField.Logs > -4.6]
# Fit a normal to the log densities (i.e. densities modeled as log-normal).
muField, sigmaField = stats.norm.fit(theField['Logs'])
theLogs = theField['Logs']
#px1 = thefield[cityChoices[0]]['Logs'].mean()
# NOTE(review): theZscore is computed from the same sample the fit used, so
# it is ~0 by construction; it and the quantities below are not read again
# in this excerpt -- presumably kept for interactive inspection.
theZscore = (theField['Logs'].mean() - muField)/sigmaField
sixteenth = muField + -1*sigmaField
eightyFourth = muField + 1*sigmaField
ratio = muField/sigmaField
zLessThanNegOne = sixteenth
zLessThanZero = muField
zLessThanOne = eightyFourth
# Mean log density per location: the x positions of the red markers below.
px=[]
for name in cityChoices:
thisPoint = theField.loc[name]['Logs'].mean()
px.append(thisPoint)
def logNormalCurve(theData, numberOfpoints, loc=None, scale=None):
    """Evaluate a fitted normal pdf over the range of ``theData``.

    Parameters
    ----------
    theData : iterable of floats; only its min/max are used to set the
        evaluation range.
    numberOfpoints : int, number of evenly spaced evaluation points.
    loc, scale : optional mean / standard deviation of the normal.  They
        default to the module-level fit (muField, sigmaField), so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    list ``[x, pdf, xmin, xmax]`` where ``x`` are the evaluation points and
    ``pdf`` the normal density at those points.
    """
    if loc is None:
        loc = muField
    if scale is None:
        scale = sigmaField
    xmin = min(theData)
    xmax = max(theData)
    x = np.linspace(xmin, xmax, numberOfpoints)
    pdf = stats.norm.pdf(x, loc=loc, scale=scale)
    return [x, pdf, xmin, xmax]
# --- Distribution panel (ax9) ---------------------------------------------
# Evaluate the fitted curve once and unpack all four results (the original
# called logNormalCurve four times with identical arguments).
x, pdf, xmin, xmax = logNormalCurve(theLogs, points)
ax9.plot(x, pdf, 'k', linestyle='dashed', color='b', alpha=1)
# Mean log density of each location and its height on the fitted pdf,
# computed once and reused for both the marker and the annotation
# (previously recomputed separately for the annotations).
px1 = px[0]
py1 = stats.norm.pdf(px1, loc=muField, scale=sigmaField)
px2 = px[1]
py2 = stats.norm.pdf(px2, loc=muField, scale=sigmaField)
px3 = px[2]
py3 = stats.norm.pdf(px3, loc=muField, scale=sigmaField)
# NOTE(review): the original also built px4/py4 from px[3]; they were never
# plotted or annotated and would raise IndexError with only three surveyed
# locations, so they are dropped here.
ax9.scatter(px1, py1, s=200, color = 'r')
ax9.scatter(px2, py2, s=200, color = 'r')
ax9.scatter(px3, py3, s=200, color = 'r')
ax9.spines['bottom'].set_position('zero')
ax9.set_title('The log normal distribution of litter densities')
ax9.set_xlabel('The ln(litter density), Cordova Creek', size=14)
# 'top=' replaces the deprecated 'ymax=' keyword.
ax9.set_ylim(top=1)
# NOTE(review): `im` is read but never displayed (the matching imshow call
# is commented out elsewhere in the notebook); kept so side effects are
# unchanged.
im = image.imread(imageFile)
# Closing text box in the lower-right panel (ax10).
closingText = 'This report printed on ' + str(today) + '\nCreated using python 3.62\nMath, trash and water\nwww.hammerdirt.ch'
ax10.text(0, 0, closingText, size=12)
ax10.set_axis_off()
ax9.annotate('Arcade Creek',
             xy=(px1, py1), xycoords='data',
             xytext=(0, 20), textcoords='offset points',
             fontsize = 12, horizontalalignment='center', verticalalignment='bottom')
ax9.annotate('Cordova Creek II',
             xy=(px2, py2), xycoords='data',
             xytext=(0, 20), textcoords='offset points',
             fontsize = 12, horizontalalignment='center', verticalalignment='bottom')
ax9.annotate('Antelope',
             xy=(px3, py3), xycoords='data',
             xytext=(0, 20), textcoords='offset points',
             fontsize = 12, horizontalalignment='center', verticalalignment='bottom')
# Save the figure to <folder>ataglance.pdf.
# NOTE(review): `folder` is a list, so str(folder) embeds brackets/quotes
# in the file name -- confirm the intended path.
saveDocString = str(folder) + 'ataglance.pdf'
pp = PdfPages(saveDocString)
Fig.savefig(pp, format='pdf')
pp.close()
plt.show()
plt.close()
In [ ]: