In [33]:
import os
import pandas as pd
import matplotlib
import numpy as np
from pandas import Series, DataFrame
import csv
from scipy import stats
import matplotlib.pyplot as plt
import random
import matplotlib.colors as colors
from datetime import date
from datetime import time
from datetime import datetime
import sys
import matplotlib.image as image
# List the sub-folders of the current working directory so the user can
# see where output could be saved.
thisq = os.getcwd()
thisx = os.listdir(thisq)
# Anything that is not a regular file is treated as a folder.
folder = [item for item in thisx if not os.path.isfile(os.path.join(thisq, item))]
print('\nHere is a list of folders in the current directory:\n')
print(folder)
In [34]:
# Load the two survey files and drop the precomputed 'Total' column;
# totals are recalculated after empty columns are removed.
d1 = pd.read_csv('calData/cWeek1.csv').drop('Total', axis=1)
d2 = pd.read_csv('calData/cCreek2.csv').drop('Total', axis=1)
In [35]:
def colDrop(userFile):
    """Drop all-zero object columns, add a per-row 'Total', keep only rows
    with at least one object, and parse the 'Date' column to datetime.

    The first three columns are treated as metadata and never dropped.
    Returns the cleaned frame (rows that summed to zero are removed,
    original index labels are kept).
    """
    # Object-count columns start at position 3; collect the all-zero ones.
    emptyCols = [col for col in userFile.columns[3:] if userFile[col].sum() == 0]
    userFile = userFile.drop(emptyCols, axis=1)
    # Row total over the remaining object columns; used to find empty rows.
    userFile['Total'] = userFile.iloc[:, 3:].sum(axis=1)
    userFile = userFile[userFile.Total > 0]
    userFile['Date'] = pd.to_datetime(userFile['Date'])
    return userFile
# Clean both frames; .copy() detaches the result from any view produced by
# the boolean filtering inside colDrop before later in-place edits.
d1 = colDrop(d1).copy()
d2 = colDrop(d2).copy()
In [36]:
# Correct a mislabeled location, then index both frames by (Location, Date).
# .loc replaces the original chained assignment d1['Location'][2] = ...,
# which triggers SettingWithCopyWarning and can silently fail on a copy.
# (The stray no-op 'd1.columns' expression was removed.)
d1.loc[2, 'Location'] = 'Antelope'
d1.set_index(['Location', 'Date'], inplace=True)
d2.set_index(['Location', 'Date'], inplace=True)
In [37]:
import re
# DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0;
# filter(regex=...) applies the same re.search match to column labels and
# keeps only the MLW object-code columns (G1, G23, ...).
justGd1 = d1.filter(regex=r'G\d+', axis=1)
justGd1.insert(0, 'Length', d1['Length'])
justGd2 = d2.filter(regex=r'G\d+', axis=1)
justGd2.insert(0, 'Length', d2['Length'])
myFrames = [justGd1, justGd2]
# Outer join keeps every object code seen at either site; codes absent from
# one survey become NaN and are zero-filled below.
joined = pd.concat(myFrames, axis=0, join='outer', copy=True)
joined.fillna(0, inplace=True)
# Rename 'Length' -> 'length' while keeping it in column position 0.
joined.insert(0, 'Lengths', joined['Length'], allow_duplicates=False)
joined.drop(['Length'], axis=1, inplace=True)
joined = joined.rename(columns={'Lengths': 'length'})
# Flatten the (Location, Date) index back into ordinary columns and save.
joined.reset_index(level=1, drop=False, inplace=True)
joined.reset_index(drop=False, inplace=True)
joined.to_csv('calData/allData.csv', index_label=False, index=False)
In [44]:
import os
import pandas as pd
import matplotlib
import numpy as np
from pandas import Series, DataFrame
import csv
from scipy import stats
import matplotlib.pyplot as plt
import random
import matplotlib.colors as colors
from datetime import date
from datetime import time
from datetime import datetime
import sys
import matplotlib.image as image
# coding: utf-8
# this script is designed to take the data collected from MLW surveys and generate automated reports
# Report menu; the user types the index number of the report they want.
task = ['0 - Create Summary Report for Project', '1 - Create Summary Report for city', '2 - Create Summary Report for Beach']
print(task)
theTask = int(input('''\n\nWith this script you can do prepare one of three reports.
your data needs to be in the correct format and located in the curent working directory
please choose one of the three options listed (the number) '''))
# Hard-coded input paths; the original interactive prompts are kept below,
# commented out, in case they need to be restored.
userCSV = 'calData/allData.csv'
#input('\nIf your file meets these criteria, please enter the complete file name and path here: ')
criteria = 'criteriaFiles/newCriteria.csv'
#input('''\nThis script sorts records by city, beach and date. The records are then assigned criteria.
#please enter the complete file name and path of the criteria file here: ''')
imageFile = 'weblogo.jpg'
#input('What is the name of the image file you want to use for your logo? (6cm X 3cm): ')
# '1252' is a codec alias for cp1252 (Windows Western Europe).
userFile = pd.read_csv(userCSV, encoding='1252')
userCols = userFile.columns
# Drop the index column left behind by a previous to_csv round trip, if any.
if 'Unnamed: 0' in userCols:
    userFile.drop(['Unnamed: 0'], axis=1, inplace=True)
thisIsmyDateColumn = 'Date'
#input('\nPlease specify(spell out) the name of the column that has the date criteria: ')
#input('\nPlease specify(spell out) the name of the column that has the date criteria: ')
def saveTo():
    """Prompt the user for an output folder and return it as 'name/'.

    Lists the sub-directories of the current working directory first so
    the user can pick an existing one. No validation is done on the answer.
    """
    here = os.getcwd()
    # Anything that is not a regular file is shown as a candidate folder.
    folder = [entry for entry in os.listdir(here)
              if not os.path.isfile(os.path.join(here, entry))]
    print('\nHere is a list of folders in the current directory:\n')
    print(folder)
    aLocation = input('''\n\nPlease indentify the folder where you would
like the report and other output data to be saved (folder name) : ''')
    return aLocation + '/'
# Ask where output should go; 'folder' (e.g. 'output/') is used as a path
# prefix by every save below.
folder = saveTo()
###getting rid of columns with no values:
## source: hammerdirt
def colDrop(userFile):
    """Drop all-zero object columns and empty rows from the survey data.

    Also writes an intermediate copy (with the per-day 'Total' column still
    attached) to '<folder>yourData_noZeroes_plusDaily_Total.csv'.
    Relies on the module-level globals `folder` and `thisIsmyDateColumn`.
    NOTE(review): this shadows the earlier colDrop defined in this
    notebook; unlike that one it removes 'Total' again and resets the
    index before returning, and it drops columns in place on the argument.
    """
    saveString = folder + 'yourData_noZeroes_plusDaily_Total.csv'
    lessThan = []
    #this gets rid of the empty columns
    for p in list(userFile.columns[3:]):
        b = userFile[p].sum()
        if b == 0:
            lessThan.append(p)
    userFile.drop(lessThan, axis=1, inplace=True)
    #use the total to check for empty rows
    userFile['Total'] = userFile.iloc[:, 3:].sum(axis=1)
    charts3 = userFile.copy()
    charts3.to_csv(saveString)
    #get rid of empty rows and put the file back to orignal state
    userFile = userFile[userFile.Total > 0]
    userFile.reset_index(drop=True, inplace=True)
    userFile.drop(['Total'], axis=1, inplace=True)
    #set the date column
    userFile[thisIsmyDateColumn] = pd.to_datetime(userFile[thisIsmyDateColumn])
    return userFile
userFile = colDrop(userFile)
In [45]:
# Column-name configuration; the defaults match the allData.csv file
# written earlier in this notebook.
changeColumnNames = input('''\nThe first columns of the data file should be "Location". The default
for this script is and "Location". The column names are used to set the index and define
how the data is sorted geographicaly. If you have formatted the file to the default values you can skip the next three
questions. Skip? (y or n): ''')
if changeColumnNames == 'y':
    #cityLabel = 'Commune'
    beachLabel = 'Location'
    #lengthLabel = 'length'
else:
    cityLabel = input('\nPlease verify the name of the column name that holds the city names: ')
    beachLabel = input('\nPleae verify the name of the column that holds the beach names: ')
    lengthLabel = input('\nPlease verify the name of the column that holds the beach length: ')
#thisIsMyCity = cityLabel
thisIsMyBeach = beachLabel
#thisIsMyLength = lengthLabel
# Index levels used for all the geographic slicing below.
myIndex = [thisIsMyBeach, thisIsmyDateColumn]
print(['0 - meters', '1 - feet'])
unitsLabel = input('\nIs this in meters or feet? ')
# NOTE(review): the menu shows '0'/'1' but the comparison expects the word
# 'meters'; entering 0 or 1 therefore falls through to 'foot'. Confirm the
# intended input format.
if unitsLabel == 'meters':
    units = 'meters'
else:
    units = 'foot'
print('Units will be abbreviated like this: ' + str(units))
print('\nThe MLW codes in the criteria file are being compared and matched up with the codes in your data file')
def userCriteria(csvFile):
    """Match the MLW codes present in the loaded survey data (module-level
    `userFile`) against the criteria file and return the overlap.

    Writes the matched criteria to '<folder>your_Criteria.csv'.
    Relies on the globals `userFile` and `folder`.
    """
    # s2: {G-code: [value]} built from the object columns of the data
    # (columns from position 4 onward of the first row).
    s1 = userFile.iloc[0:1, 4:]
    s2 = s1.to_dict('list')
    d1 = pd.read_csv(csvFile)
    cols = d1.columns
    if 'Unnamed: 0' in cols:
        d1.drop(['Unnamed: 0'], axis=1, inplace=True)
    #d1.set_index(['Type'], inplace=True)
    d2 = d1.to_dict('list')
    thisList = {}  #<--------- collects the matched {code: criteria-column} pairs
    # iterate through all the G codes in the user data
    for i, x in enumerate(s2.keys()):  #<------ Just looking at the dict keys
        # compare those to the G codes that are in the new criteria
        for y in (d2.keys()):
            # every G code provided by the user is compared to the G code
            # in the criteria list; on a match the criteria column is kept
            if x == y:
                q = d2[x]  #<----------- criteria values for this code
                new = {x: q}  #<------- the user's G code picks up the criteria values
                thisList.update(new)
    ds = pd.DataFrame(thisList)
    saveString = str(folder) + 'your_Criteria.csv'
    ds.to_csv(saveString)
    print('\nThe criteria has been matched to your data, the file is stored under ' + str(saveString))
    return ds
ds = userCriteria(criteria)
w = len(list(ds.columns))
print('\nThere are ' + str(w) + ' different MLW codes in your data.')
userCols = userFile.columns
def sortIt():
    """Ask the user to name the three criteria rows, pick which row to
    group by and which one holds the material, and return the two mappings
    as [sortDict, materialDict] (each {G-code: category}).

    Mutates the module-level `ds` frame: inserts a 'Code' column and sets
    it as the index.
    """
    print('''\nThe contents of the criteria file need to be named.
Please provide a one word description for the contents of each row.
The minimum size criteria file is three rows plus the column headers
-one row for your specific criteria
-one row for a description of the MLW code
-one row for the material\n''')
    print('''If the order and names of the rows in your criteria file are:
\nRow one : Material
\nRow two : Description
\nRow three : Source
\nOr if you are using the criteria file provided you can skip the next three questions''')
    skip = input('Skip the next three questions?(y/n) ')
    if skip == 'y':
        # Default row names for the bundled criteria file.
        firstRow = 'Material'
        secondRow = 'Description'
        thirdRow = 'Source'
        ds.insert(0, 'Code', [firstRow, secondRow, thirdRow])
        ds.set_index('Code', inplace=True, drop=True)
    else:
        firstRow = input('Please provide a one word desription for the contents of the first row: \n')
        secondRow = input('\nTitle for the second row: ')
        thirdRow = input('\nTitle for the third row: ')
        ds.insert(0, 'Code', [firstRow, secondRow, thirdRow])
        ds.set_index('Code', inplace=True, drop=True)
    mySort = input('\nWhich one would you like to use to group your data? ')
    myMat = input('\nWhich one is the material type? ')
    # Convert the chosen rows into {G-code: category} dictionaries for
    # the axis-1 groupbys below.
    dSortx = ds.loc[mySort]
    dMatx = ds.loc[myMat]
    dsort = dSortx.to_dict()
    dmat = dMatx.to_dict()
    return [dsort, dmat]
dsort, dmat = sortIt()
userFile.set_index(myIndex, inplace=True)
print('''At the end of the this script you can do other analysis on the formatted data
by calling 'userFile', it is a pandas DataFrame with the index set to City, Location, Date''')
# use the dict to create a groupby instance
# on the original data frame
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas.
source = userFile.groupby(dsort, axis=1)
mat = userFile.groupby(dmat, axis=1)
# here is the data that will be charted
oneMore = pd.DataFrame(source.sum())  # totals per user-chosen category
oneMat = pd.DataFrame(mat.sum())      # totals per material
oneMat['Total'] = oneMat.iloc[:, 1:].sum(axis=1)
oneMore['Total'] = oneMore.iloc[:, 1:].sum(axis=1)
userFile['Total'] = userFile.iloc[:, 1:].sum(axis=1)
oneMore.to_csv('your_Data_sorted.csv')
print('\nYour data has been sorted and is ready to be presented based on your criteria\n')
print('\nThere is a copy stored locally under "your_Data_sorted.csv"\n')
x1 = userFile.index.values  # array of (Location, Date) tuples
def createChoices(indexValues):
    """Split an iterable of (level-one, level-two) index tuples into two
    lists of unique values, preserving first-seen order.

    Returns [uniqueLevelOne, uniqueLevelTwo].
    """
    cList = []
    lList = []
    #iterate through the list of tuples
    for c, l in indexValues:
        #use the position to seperate
        cList.append(c)
        lList.append(l)
    # dict.fromkeys() deduplicates in O(n) while preserving insertion
    # order, replacing the original O(n^2) list-scan comprehension.
    levelOne = list(dict.fromkeys(cList))
    levelTwo = list(dict.fromkeys(lList))
    return [levelOne, levelTwo]
# Unique locations (level 0) and dates (level 1) from the index tuples.
theChoices = createChoices(x1)
cityChoices = theChoices[0]
beachChoices = theChoices[1]
# Sorted indexes are required for .loc / IndexSlice slicing below.
userFile.sort_index(inplace=True)
oneMore.sort_index(inplace=True)
oneMat.sort_index(inplace=True)
idx = pd.IndexSlice
if theTask == 1:
    # City report: pick one level-0 value and slice on it.
    yourChoice = pd.Series(cityChoices)
    print('\nThe report will be based on one of the items on this list:')
    for i, x in enumerate(yourChoice):
        print(str(i) + ' - ' + str(x), sep=', ')
    userChoice = int(input('\nenter the index number of the city of your choice : '))
    myCityName = input('\nCan you confirm the name of your choice?\nThis will be included in report and chart titles: ')
    x2 = cityChoices[userChoice]
    analyzeAll = userFile.copy()
    analyzeThis = userFile.loc[x2].copy()
    analyzeSource = oneMore.loc[x2].copy()
    analyzeMat = oneMat.loc[x2].copy()
elif theTask == 2:
    # Beach report: slice on the second index level with IndexSlice.
    yourChoice = pd.Series(beachChoices)
    print('\nThe report will be based on one of the items on this list:')
    for i, x in enumerate(yourChoice):
        print(str(i) + ' - ' + str(x), sep=', ')
    userChoice = int(input('\nenter the index number of your choice : '))
    myCityName = input('\nCan you confirm the name of your choice?\nThis will be included in report and chart titles: ')
    x2 = beachChoices[userChoice]
    analyzeAll = userFile.copy()
    analyzeThis = userFile.loc[idx[:, [x2]], :].copy()
    analyzeSource = oneMore.loc[idx[:, [x2]], :].copy()
    # NOTE(review): 'analyzeMAt' (capital A) does not match 'analyzeMat'
    # assigned in the theTask == 1 branch — probable typo; confirm which
    # name downstream cells expect.
    analyzeMAt = oneMat.loc[idx[:, [x2]], :].copy()
elif theTask == 0:
    myReportName = input('''You have chosen a summary report for all the data.
Can you give a name for the report, it will be used in chart titles: ''')
    analyzeAll = userFile.copy()
    analyzeSource = oneMore.copy()
    # NOTE(review): same 'analyzeMAt' typo as above; this branch also never
    # defines analyzeThis / x2 / myCityName, which later cells reference —
    # a task-0 run would raise NameError there.
    analyzeMAt = oneMat.copy()
print('''\nThis module outputs several DataFrames with the same index:
- userFile (data cleaned up)
- oneMore(data sorted by user criteria)
- oneMat(data sorted by material)''')
####This gets the topTen values from the data
## Prepares the data to be used in ax1
# outPut is a DataFrame with 12 columns including a total column
# the data is selected by the location specified by the user
In [46]:
def topTen(file):
    """Return a frame with the ten highest-count object columns, an
    'All other objects' column summing the rest, and the 'Total' column.

    Assumes the first column is metadata (length) and the last is 'Total';
    both are excluded from the ranking.
    """
    cols = list(file.columns)
    # Rank the object columns (everything between first and last) by
    # their grand total, largest first.
    totals = Series(file[cols[1:-1]].sum(), name='Lake').sort_values(ascending=False)
    topCodes = totals.index.values[:10]
    restCodes = totals.index.values[10:]
    ranked = pd.DataFrame(file[topCodes])
    others = Series(file[restCodes].sum(axis=1), name='All other objects')
    ranked = pd.concat([ranked, others], axis=1)
    return pd.concat([ranked, pd.DataFrame(file['Total'])], axis=1)
####this section turns top ten items(plus the other objects)
####into a percentage of the whole. by dividing by the Total column
## the data is used for ax1
# outPut is a DataFrame with 1 row and 11 columns
# row name is the location
def the2ColumnDivider(thedataframe, theDenominator):
    """Express every column except the last as a percentage of the
    `theDenominator` column.

    Returns a new DataFrame with the same column order; the last column
    (assumed to be the denominator/total column) is omitted.
    """
    numerators = thedataframe[list(thedataframe.columns[:-1])]
    # Row-wise divide then scale to percent; replaces the original
    # column-by-column concat loop and its unused locals.
    return numerators.div(thedataframe[theDenominator], axis=0) * 100
def theDenseDivider(theDataFrame, theDenominator, theSorter, percent):
    """Divide every column after the first by the `theDenominator` column,
    then group the resulting columns with the `theSorter` mapping
    (MLW code -> category) and sum each group.

    percent: 'y' scales the ratios to percentages; anything else leaves
    them as densities (pieces per unit length). Columns missing from the
    mapping are dropped, as with the original dict groupby.
    """
    numerators = theDataFrame[list(theDataFrame.columns[1:])]
    ratios = numerators.div(theDataFrame[theDenominator], axis=0)
    if percent == 'y':
        ratios = ratios * 100
    # Transpose-groupby-transpose instead of groupby(..., axis=1): the
    # axis=1 form is deprecated (and removed in recent pandas); this is
    # the equivalent supported spelling.
    return ratios.T.groupby(theSorter).sum().T
def theLocationBox(dataToBeCharted, string):
    """Build a display string: the `string` header followed by the unique
    level-0 index values (location names), one per line.

    Fixes two defects in the original: it referenced the global `stringx`
    instead of its `string` parameter (NameError once the callers that set
    stringx were commented out), and it appended a partial copy of the
    text on every loop pass, duplicating the list in the output.
    """
    # Unique locations in first-seen order.
    uniquePlaces = []
    for spot in dataToBeCharted.index.get_level_values(0):
        if spot not in uniquePlaces:
            uniquePlaces.append(spot)
    text = string + ':\n\n '
    for place in uniquePlaces:
        text = text + ' ' + place + '\n '
    return text + '\n'
#if theTask == 1 or 2:
#if theTask == 1:
#stringx = 'The locations in '
#myLocations = theLocationBox(densitySource, stringx) #<------ ax4 inPut
#else:
#stringx = 'This site is located in '
#myLocations = theLocationBox(densitySource, stringx) #<------ ax4 inPut
def stringMaker(columnSource, sourceColumns):
    """For every category column in `columnSource`, list the MLW codes that
    were grouped into it (looked up via the `sourceColumns` groupby),
    wrapping to a new line every four codes.

    Fixes a defect in the original: the first code of each group was
    silently skipped because index 0 matched neither formatting branch
    (0 % 4 == 0 but the 'and i != 0' guard excluded it).
    """
    wordsList = []
    for name in columnSource.columns:
        gun = list(sourceColumns.get_group(name).columns)
        thisString = 'The MlW codes included in ' + name + ':\n'
        for j, code in enumerate(gun):
            if j == len(gun) - 1:
                thisString = thisString + code + '\n'   # last code in group
            elif (j + 1) % 4 == 0:
                thisString = thisString + code + '\n'   # wrap after every 4th
            else:
                thisString = thisString + code + ', '
        wordsList.append(thisString)
    anotherString = ''
    for words in wordsList:
        anotherString = anotherString + '\n' + words
    return anotherString
def topTenLength(file):
    """Like topTen(), but append the survey 'length' column instead of
    'Total', so downstream density (pieces per length) math can use it.

    Fixes a defect: the original read length from the global `userFile`
    rather than from the `file` argument, silently mixing frames whenever
    the caller passed anything other than userFile itself.
    """
    cols = list(file.columns)
    totals = Series(file[cols[1:-1]].sum(), name='Lake').sort_values(ascending=False)
    topCodes = totals.index.values[:10]
    restCodes = totals.index.values[10:]
    ranked = pd.DataFrame(file[topCodes])
    others = Series(file[restCodes].sum(axis=1), name='Other')
    ranked = pd.concat([ranked, others], axis=1)
    # length of each survey, taken from the argument (not the global).
    lengths = pd.DataFrame(file['length'])
    return pd.concat([ranked, lengths], axis=1)
### this function divides the columns by either the total or the length
## you can select either percent 'y' (divide by total)
## or 'n' (divided by length)
# outPut is a dataFrame
def thetopTenDivider(theDataFrame, theDenominator, percent):
    """Divide every column except the last by the `theDenominator` column.

    percent: 'y' -> percentage of the denominator (e.g. of 'Total');
    anything else -> plain ratio (e.g. density when dividing by 'length').
    """
    numerators = theDataFrame[list(theDataFrame.columns[:-1])]
    # Row-wise vectorized divide; replaces the original per-column concat
    # loop and its unused locals.
    ratios = numerators.div(theDataFrame[theDenominator], axis=0)
    if percent == 'y':
        ratios = ratios * 100
    return ratios
def otherTopTen(theReferenceData, theData):
    """Pull, from `theData`, the value for every code in
    `theReferenceData`'s index except the last entry (the 'Other' bucket),
    and return them as the columns of a one-row DataFrame.
    """
    codes = list(theReferenceData.index[:-1])
    if not codes:
        # No codes to pull: same empty frame the original loop produced.
        return pd.DataFrame()
    pulled = [Series(theData.loc[code], name=code) for code in codes]
    return pd.concat(pulled, axis=1)
def dropColAndSum(dataToDropFrom, otherTopTenColsToDrop, lengthOrTotalcolumn, nameOtherObjects):
    """Sum everything in `dataToDropFrom` that is not already shown in the
    top-ten frame and append it as a `nameOtherObjects` column.

    Fixes a defect: the original concatenated the global `thisUnDf`
    instead of its `otherTopTenColsToDrop` parameter (they happened to be
    the same object at the call site, which masked the bug).
    """
    # Remove the codes already charted, then the length/Total entry, and
    # collapse the remainder into a single 'other objects' value.
    remainder = dataToDropFrom.drop(list(otherTopTenColsToDrop.columns))
    otherTotal = Series(remainder.drop(lengthOrTotalcolumn).sum(), name=nameOtherObjects)
    return pd.concat((otherTopTenColsToDrop, otherTotal), axis=1)
# NOTE(review): 'theTask == 1 or 2' is always True — it parses as
# (theTask == 1) or (2), and the literal 2 is truthy. The intended test is
# probably 'theTask in (1, 2)'. The same pattern repeats below.
if theTask == 1 or 2:
    allChart = pd.DataFrame(Series(topTen(analyzeAll).sum(), name='All Samples'))
    localChartX2 = pd.DataFrame(Series(topTen(analyzeThis).sum(), name=x2))
    topTenSummary = localChartX2.transpose()
    allChartSummary = allChart.transpose()
####this section turns top ten items(plus the other objects)
####into a percentage of the whole. by dividing by the Total column
## the data is used for ax1
# outPut is a DataFrame with 1 row and 11 columns
# row name is the location
###this is the output for ax1 of the city report
if theTask == 1 or 2:
    x2TopTenSummary = the2ColumnDivider(topTenSummary, 'Total')  #<------ax1 inPut
    ####this is the topten of all the data provided
    allLocationsTopTenSummary = the2ColumnDivider(allChartSummary, 'Total')
####this section gives you the density or % values of the user criteria
## the Sorter is generated when the user chooses what category to sort by
## the material is sorted as default
## this gives output for the timeSeries charts
## and the average density of the users criteria and the material %
# outPut is DataFrame grouped by the user criteria
### This is the output for the timeSeries charts
### the output is for figure 2 ax1
if theTask == 1 or 2:
    allDensities = theDenseDivider(analyzeAll, 'length', dsort, 'n')
    densitySource = theDenseDivider(analyzeThis, 'length', dsort, 'n')
    densityMat = theDenseDivider(analyzeThis, 'Total', dmat, 'y')
    ###this is the outPut for the summary density chart (user criteria)
    ## and the summary material percentage chart
    matDensitySummaryLocation = pd.DataFrame(Series(densityMat.mean(), name=x2)).transpose()  #<------ax2 inPut
    sourceDensitySummaryLocation = pd.DataFrame(Series(densitySource.mean(), name=x2)).transpose()  #<-----ax3 inPut
    allDensitiesSource = pd.DataFrame(Series(allDensities.mean(), name='All Locations')).transpose()
####This section creates the string that displays
## the beach locations in the data acording to the users city selection
# outPut is a string based on the unique values of the index
###this section gets the info for the summary data
# outPut is string for ax5
if theTask == 1 or 2:  # NOTE(review): always True — see note above
    nSamples = len(list(analyzeSource.index.get_level_values(0)))  #<----- number of samples
    averageDensity = sourceDensitySummaryLocation.sum().sum().round(2)  #<------ averaged density
    totalNumber = analyzeThis['Total'].sum()  #------------ total number of pieces
    if theTask == 2:
        # NOTE(review): level 2 will not exist on a two-level
        # (Location, Date) index — confirm the index shape expected here.
        dates = densitySource.index.get_level_values(2)  #<------- getting the dates from the index
    if theTask == 1:
        dates = densitySource.index.get_level_values(0)
    # NOTE(review): the conversion result is discarded; 'dates' is NOT
    # converted in place — probably meant dates = pd.to_datetime(dates).
    pd.to_datetime(dates)
    theEarliest = min(dates).date()  #<--------- the earliest date
    theLatest = max(dates).date()  #<------------- the latest date
#else:
# nSamples = len(list(densitySource.index.get_level_values(0)))#<----- number of samples
# averageDensity = densitySource.mean().sum().round(2)#<------ averaged density
# totalNumber = localChartX2.loc['Total'].item()#------------ total number of pieces
# dates = densitySource.index.get_level_values(1)#<------- getting the dates from the index
# theEarliest = min(dates).date()#<--------- the earliest date
# theLatest = max(dates).date()#<---------------- the latest date
###<-------------- ax5 input: summaryData
# NOTE(review): myCityName is only defined for theTask 1 or 2 — a task-0
# run raises NameError here.
summaryData = 'Litter density : ' + myCityName
summaryData2 = 'Total number of pieces: ' + str(totalNumber) + '\nAverage density: ' + str(averageDensity) + ' pcs/m' + '\nFirst sample: ' + str(theEarliest) + '\nLast sample : ' + str(theLatest) + '\nNumber of samples: ' + str(nSamples) + '\n'
####this section prints out the items in the categories
## The Categories and the MLW codes are determined by the userCriteria file
# outPut is a string
# NOTE(review): 'task' here is the menu *list*, so 'task == 1 or 2' is
# always True via the truthy literal 2; probably meant 'theTask in (1, 2)'.
if task == 1 or 2:
    mlwCodeInventory = stringMaker(densitySource, source)  #<----- ax6 data
###This section gets the data to make the topten comparison chart
## as opposed to the individual chart this value is given as a density
## The topten of all the data is calculated
## then the Gcodes from that are pulled from the local data
# outPut is two dataFrames with one row and 11columns
### this function gets the top ten density values
## input is allthe data
# outPut is a dataFrame with 11 columns and one row
if task == 1 or 2:  # NOTE(review): always True ('task' is a list) — see note above
    # Mean density per object code: the user's selection vs all data.
    thisUn = pd.DataFrame(thetopTenDivider(analyzeThis, 'length', 'n')).mean()
    thatUn = pd.DataFrame(thetopTenDivider(topTenLength(analyzeAll), 'length', 'n')).mean()
    ### this function takes the index from 'thatUn'
    ## and pulls the corresponding value from 'thisUn'
    # outPut is a dataFrame of the users selection
    thisUnDf = otherTopTen(thatUn, thisUn)
    ### This function caluclates the value of 'Other objects' for 'thisUnDf'
    ## it drops the columns from the previous function
    ## sums up whats left and creates a series 'Other objects"
    ## that series gets appended to thisUnDf
    # outPut is a df that will be charted
    dropMe = dropColAndSum(thisUn, thisUnDf, 'length', thatUn.index[-1])
    #converting to a dataFrame and transposing
    thatUn = pd.DataFrame(thatUn)
    thatUnOne = thatUn.transpose()
    valsLocal = dropMe.iloc[0].values
    valsGlobal = thatUnOne.iloc[0].values
# Fit a normal distribution to the log of litter density for ALL data
# (the 'field') ...
theField = analyzeAll.copy()
theField = theField[theField.Total.notnull()]
theField = theField[theField.length < 375]  # drop the extreme-length surveys
theField['Dense'] = theField['Total'] / theField['length']
theField['Logs'] = theField['Dense'].apply(np.log)
theField = theField[theField.Logs > -4.6]  # ln(1/100): drop near-zero densities
muField, sigmaField = stats.norm.fit(theField['Logs'])
# ... and the same for the user's selection, plus its z-score vs the field.
th = analyzeThis.copy()
th = th[th.Total.notnull()]
th = th[th.length < 375]  #<---------- the maximum length values were removed
th['Dense'] = th['Total'] / th['length']
th['Logs'] = th['Dense'].apply(np.log)
th = th[th.Logs > -4.6]  #<----------- this is equal to 1/100
mu, sigma = stats.norm.fit(th['Logs'])
theZscore = (th['Logs'].mean() - muField) / sigmaField
points = len(theField['Logs'])
In [47]:
# ---- Figure 1: one-page PDF report for the selected city/beach ----
fig = plt.figure(figsize=(8.5, 11), frameon=False, edgecolor='000000', linewidth=1)
# Axes rectangles: (left, bottom, width, height) in figure fractions.
rect0 = .65, .71, .1, .24
rect1 = .65, .36, .1, .24
rect2 = .13, .36, .1, .24
rect3 = .23, .7, .06, .06
rect4 = .08, .9, .4, .05
rect5 = .08, .82, .4, .09
rect6 = .13, .07, .4, .195
rect7 = .6, .12, .3, .09
rect8 = .62, .16, .2, .1
rect9 = .08, .66, .3, .02
ax1 = fig.add_axes(rect0)  #<-----------x2TopTenSummary
ax2 = fig.add_axes(rect1)  #<-----------matDensitySummaryLocation
ax3 = fig.add_axes(rect2)  #<-----------sourceDensitySummaryLocation
ax4 = fig.add_axes(rect3)  #<-----------performance
ax5 = fig.add_axes(rect4)  #<-----------summaryDataTitle
ax8 = fig.add_axes(rect7)  #<-----------summaryData
ax6 = fig.add_axes(rect5)  #<-----------summaryData2
ax7 = fig.add_axes(rect6)  #<----------- comparison chart
ax9 = fig.add_axes(rect8)  #<----------- image
# NOTE(review): this shadows matplotlib.colors, imported as 'colors' above.
colors = ['#003333','#339966', '#66CCCC', '#3399CC', '#993333', '#CC6600', '#FF0033', '#FFCC00', '#3366CC', '#66CC00', '#333300', '#FF0000']
anuther = [ax1, ax2, ax3]
# One-row summaries, sorted largest-first, drawn as stacked bars.
one = x2TopTenSummary.iloc[0].sort_values(ascending=False)
two = matDensitySummaryLocation.iloc[0].sort_values(ascending=False)
three = sourceDensitySummaryLocation.iloc[0].sort_values(ascending=False)
data = [one, two, three]
for t, r in enumerate(data):
    blocks = 0.1
    width = 0.6
    bottom = 0
    z = r.index
    q = anuther[t]
    lables = list(z)  # NOTE(review): unused (typo of 'labels'?)
    # Stack one bar segment per category, tallest at the bottom.
    for i, u in enumerate(z):
        color = colors[i]
        q.bar(blocks, r[u], width=width, bottom=bottom, color=color, label=z[i])
        bottom += r[u]
    # Reverse the legend so it reads top-of-stack first.
    handles, labels = q.get_legend_handles_labels()
    q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1), borderaxespad=0., fancybox=False, frameon=False)
    if t == 0:
        title = 'Top-ten MLW codes'
        ylabel = 'Topten objects as a % of total'
        q.set_ylim(top=102)
    elif t == 1:
        title = 'Materials'
        ylabel = 'Material as a % of total'
        q.set_ylim(top=102)
    elif t == 2:
        title = 'Density'
        ylabel = 'Pieces of trash per ' + unitsLabel
    q.set_title(title, size=14, loc='left')
    q.set_ylabel(ylabel, size=12)
    # NOTE(review): string 'off' for tick_params booleans is deprecated;
    # recent matplotlib expects False.
    q.tick_params(axis='x', labelbottom='off', bottom='off', top='off')
    nSamples = len(analyzeAll.loc[x2]['Total'])
    if nSamples > 1:
        noSample = 'samples'
    else:
        noSample = 'sample'
    q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
    q.xaxis.labelpad = 20
# Color the performance dot by the percentile of this site's mean
# log-density within the field distribution.
percen = stats.percentileofscore(theField['Logs'], th['Logs'].mean()).round(0)
if percen <= 16:
    color = 'g'
elif 17 <= percen <= 49:
    color = 'y'
elif 50 <= percen < 83:
    color = '#FF4500'
elif 84 <= percen:
    color = 'r'
zText = 'Density is greater than '
theRest = '%\n of all other locations'
ax4.scatter(1, 1, s=500, color=color)
ax4.set_title('Overall performance')
ax4.set_xlabel(zText + str(percen) + theRest)
ax4.tick_params(axis='both', labelbottom='off', bottom='off', top='off', left='off')
ax4.set_yticklabels([])
ax4.set_xticklabels([])
ax5.text(0, 1, summaryData, size=16, wrap=True, verticalalignment='top')
ax6.text(0, 1, summaryData2, size=14, verticalalignment='top')
ax5.set_axis_off()
ax6.set_axis_off()
theLogs = theField['Logs']
theZscore = (theField['Logs'].mean() - muField) / sigmaField  # NOTE(review): always 0 by construction
sixteenth = muField + -1 * sigmaField
eightyFourth = muField + 1 * sigmaField
ratio = muField / sigmaField
zLessThanNegOne = sixteenth
zLessThanZero = muField
zLessThanOne = eightyFourth
def logNormalCurve(theData, numberOfpoints, loc=None, scale=None):
    """Evaluate a normal pdf over the observed range of `theData`.

    loc/scale default to the module-level field fit (muField, sigmaField),
    so existing two-argument calls behave exactly as before; passing them
    explicitly removes the hidden global dependency.

    Returns [x, pdf, xmin, xmax].
    """
    if loc is None:
        loc = muField
    if scale is None:
        scale = sigmaField
    xmin = min(theData)
    xmax = max(theData)
    x = np.linspace(xmin, xmax, numberOfpoints)
    pdf = stats.norm.pdf(x, loc=loc, scale=scale)
    return [x, pdf, xmin, xmax]
# NOTE(review): calling logNormalCurve four times repeats the same
# computation; one call unpacked into x, pdf, xmin, xmax would do.
x = logNormalCurve(theLogs, points)[0]
pdf = logNormalCurve(theLogs, points)[1]
xmin = logNormalCurve(theLogs, points)[2]
xmax = logNormalCurve(theLogs, points)[3]
# Mark this location's mean log-density on the field curve and shade the
# area below it.
px = th['Logs'].mean()
py = stats.norm.pdf(px, loc=muField, scale=sigmaField)
ax7.plot(x, pdf, 'k', linestyle='dashed', color='b', alpha=1)
ax7.fill_between(x, stats.norm.pdf(x, loc=muField, scale=sigmaField), where=(x <= px), color='g', alpha=0.3)
ax7.vlines(px, 0, stats.norm.pdf(muField, loc=muField, scale=sigmaField), color='b', linestyle='-.')
ax7.scatter(px, py, s=200, color='r')
ax7.spines['bottom'].set_position('zero')
ax7.set_title('The distibtuion of all values and this location')
ax7.set_xlabel('The ln of litter density', size=14)
today = date.today()
closingText = 'This report printed on ' + str(today) + '\nCreated using python 3.62\n Math, trash and water\n www.hammerdirt.ch'
ax8.text(0, 0, closingText, size=12, wrap=True, verticalalignment='top')
ax8.set_axis_off()
im = image.imread(imageFile)
ax9.imshow(im, aspect='auto', zorder=-1)
ax9.set_axis_off()
# Save a PNG copy and the one-page PDF report.
from matplotlib.backends.backend_pdf import PdfPages
plt.savefig('thisImage.png', bbox_inches='tight')
saveDocString = str(folder) + str(x2) + '_litter_Density.pdf'
pp = PdfPages(saveDocString)
fig.savefig(pp, format='pdf')
pp.close()
plt.show()
plt.close()
In [25]:
# ---- Figure 2: summary page for all of the data ----
fig = plt.figure(figsize=(8.5, 11), frameon=False, edgecolor='000000', linewidth=1)
thisX = pd.DataFrame(source.sum())
otaysiix = stringMaker(thisX, source)  # category -> MLW-code inventory text
idx = pd.IndexSlice
totals = analyzeAll['Total']
total = totals.sum().round(0)
theDates = analyzeAll.index.get_level_values(1)
# NOTE(review): this converts 'dates' (left over from an earlier cell),
# not 'theDates', and discards the result — probably meant
# theDates = pd.to_datetime(theDates).
pd.to_datetime(dates)
firstDate = min(theDates).date()
lastDate = max(theDates).date()
numberSamples = len(totals)
# Numbered list of locations for the summary text block.
yourChoice = pd.Series(cityChoices)
thisString = ''
for i, x in enumerate(yourChoice):
    pos = str(i)
    city = x  # NOTE(review): unused
    string = pos + ' - ' + x + '\n'
    thisString = thisString + string
#print(thisString)
where = '\nIncludes data from the following locations:\n' + '\n' + thisString
howManyTimes = '\nTotal number of samples: ' + str(numberSamples)
howMany = '\nTotal number of pieces: ' + str(total)
avDensity = '\nAverage density: ' + str(allDensitiesSource.mean().sum().round(1))
first = '\nFirst sample: ' + str(firstDate)
last = '\nLast sample: ' + str(lastDate)
# Axes rectangles: (left, bottom, width, height) in figure fractions.
rect8 = .62, .16, .2, .1
rect9 = .08, .66, .3, .02
rect0 = 0.05, .7, .45, .25
rect1 = .3, .7, .18, .3
rect2 = .64, .2, .25, .75
rect3 = 0, 0, .7, .34
rect4 = 0.18, 0.4, .1, .22
rect5 = 0.17, 0.09, .1, .22
rect6 = .6, .12, .3, .09
rect7 = .62, .14, .2, .08
#gridSize = gridspec.GridSpec(24, 8)
ax1 = fig.add_axes(rect0)  #, rowspan=10, colspan=2)
#ax2 = fig.add_axes(rect1)#, rowspan=10, colspan=2)
ax3 = fig.add_axes(rect2)
#ax5 = fig.add_subplot(gridSize[1:2, 7:])
#ax4 = fig.add_axes(rect3)
ax5 = fig.add_axes(rect4)
ax6 = fig.add_axes(rect5)
ax7 = fig.add_axes(rect6)
#ax8 = fig.add_axes(rect7)
ax1.text(0, 1, where + howManyTimes + howMany + avDensity + first + last, verticalalignment='top', size=12)
ax1.set_title('Summary information for all data', loc='left', size=14)
#ax2.text(0,1, 'ax2')
ax3.text(0, 1, otaysiix, size=10, wrap=True, verticalalignment='top')
ax3.set_title('Items by category', loc='right', size=14)
# NOTE(review): ax4 is commented out above — this reuses the ax4 left over
# from the previous figure's cell (or raises NameError on a fresh run).
ax4.text(0, 1, 'ax4')
anuther = [ax5, ax6]
one = x2TopTenSummary.iloc[0].sort_values(ascending=False)
two = matDensitySummaryLocation.iloc[0].sort_values(ascending=False)
three = sourceDensitySummaryLocation.iloc[0].sort_values(ascending=False)
oneAll = allLocationsTopTenSummary.iloc[0].sort_values(ascending=False)
twoAll = allDensitiesSource.iloc[0].sort_values(ascending=False)
#threeAll=
data = [oneAll, twoAll]
nSamples = len(analyzeAll['Total'])
for t, r in enumerate(data):
    blocks = 0.1
    width = 0.6
    bottom = 0
    z = r.index
    q = anuther[t]
    lables = list(z)  # NOTE(review): unused (typo of 'labels'?)
    for i, u in enumerate(z):
        color = colors[i]
        q.bar(blocks, r[u], width=width, bottom=bottom, color=color, label=z[i])
        bottom += r[u]
    handles, labels = q.get_legend_handles_labels()
    q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1), borderaxespad=0., fancybox=False, frameon=False)
    if t == 0:
        title = 'Top-ten MLW codes'
        ylabel = 'Topten objects as a % of total'
    elif t == 1:
        title = 'Density'
        ylabel = 'Pieces of trash per ' + unitsLabel
    elif t == 2:
        title = 'Density'
        ylabel = 'Pieces of trash per ' + unitsLabel
    q.set_title(title, size=14, loc='left')
    q.set_ylabel(ylabel, size=12)
    q.tick_params(axis='x', labelbottom='off', bottom='off', top='off')
    if nSamples > 1:
        noSample = 'samples'
    else:
        noSample = 'sample'
    q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
    q.xaxis.labelpad = 20
today = date.today()
closingText = 'This report printed on ' + str(today) + '\nCreated using python 3.62\n Math, trash and water\n www.hammerdirt.ch'
ax7.text(0, 0, closingText, size=12, wrap=True, verticalalignment='top')
ax7.set_axis_off()
#im = image.imread(imageFile)
#ax8.imshow(im, aspect='auto', zorder=-1)
#ax8.set_axis_off()
ax3.set_axis_off()
ax4.set_axis_off()
ax1.set_axis_off()
# PdfPages was imported in the previous figure's cell.
saveDocString = str(folder) + 'summary.pdf'
pp = PdfPages(saveDocString)
fig.savefig(pp, format='pdf')
pp.close()
plt.show()
plt.close()
In [163]:
gh = cityChoices
#densitySource = theDenseDivider(analyzeThis, 'length', dsort, 'n')#<-----ax3 inPut
#pd.DataFrame(Series(densitySource.mean(), name=x2)).transpose()#<-----ax3 inPut
# Mean density by category for a single location (exploratory cell).
antelopeDensity = theDenseDivider(analyzeAll.loc[gh[2]], 'length', dsort, 'n')
antelopeDensity = pd.DataFrame(Series(antelopeDensity.mean(), name=gh[2]))
# One mean-density frame per location.
theDataList = []
for i in cityChoices:
    theDensity = theDenseDivider(analyzeAll.loc[i], 'length', dsort, 'n')
    theDensity = pd.DataFrame(Series(theDensity.mean().sort_values(ascending=False), name=i))
    theDataList.append(theDensity)
In [178]:
#pd.DataFrame(Series(topTen(analyzeThis).sum(), name=x2))
#topTenSummary = localChartX2.transpose()
#x2TopTenSummary = the2ColumnDivider(topTenSummary, 'Total')
# Per-location top-ten tables as a % of each location's total; 'Total'
# survives the divider (sorting put it first, not last) so drop it here.
theTopTenList = []
for i in cityChoices:
    theTop = pd.DataFrame(Series(topTen(analyzeAll.loc[i]).sum().sort_values(ascending=False), name=i))
    theTop = theTop.transpose()
    theTop = the2ColumnDivider(theTop, 'Total')
    theTop.drop(['Total'], axis=1, inplace=True)
    theTopTenList.append(theTop)
theTopTenList[0].index.values[0]
Out[178]:
In [32]:
from matplotlib.gridspec import GridSpec
# Rebuild the two chart inputs for the at-a-glance figure: per-location
# mean densities (with the grand total of each, used for a shared y-limit)
# and per-location top-ten shares.
theDensityList = []
theHighAndLow = []
for location in cityChoices:
    means = theDenseDivider(analyzeAll.loc[location], 'length', dsort, 'n').mean()
    meanRow = pd.DataFrame(Series(means, name=location)).transpose()
    theHighAndLow.append(meanRow.sum().sum())
    theDensityList.append(meanRow)
# Inspect the largest per-location total (cell-output expression).
max(theHighAndLow)
theTopTenList = []
for location in cityChoices:
    counts = Series(topTen(analyzeAll.loc[location]).sum().sort_values(ascending=False), name=location)
    asRow = pd.DataFrame(counts).transpose()
    shares = the2ColumnDivider(asRow, 'Total')
    shares.drop(['Total'], axis=1, inplace=True)
    theTopTenList.append(shares)
# Color cycle for the stacked bars (one color per category).
colors = ['#003333','#339966', '#66CCCC', '#3399CC', '#993333', '#CC6600', '#FF0033', '#FFCC00', '#3366CC', '#66CC00', '#333300', '#FF0000']
# Letter-size page: two rows of four small panels on top, two wide
# panels (distribution + text) on the bottom.
Fig = plt.figure(figsize=(8.5, 11))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                    wspace=.9, hspace=.5)
gs = GridSpec(3, 5)
ax1, ax2, ax3, ax4 = (plt.subplot(gs[0, col]) for col in range(4))
ax5, ax6, ax7, ax8 = (plt.subplot(gs[1, col]) for col in range(4))
ax9 = plt.subplot(gs[2:, 0:3])
ax10 = plt.subplot(gs[2:, 3:])
topTenRow = [ax1, ax2, ax3, ax4]
densityRow = [ax5, ax6, ax7, ax8]
def charter(data, axes, listName):
    """Draw one stacked single-bar chart per location.

    Parameters
    ----------
    data : list of single-row DataFrames, one per location; the row label
        is the location name and each column holds one category's value.
    axes : list of matplotlib axes, parallel to ``data``.
    listName : 'topTenList' or 'densityList'; selects the y-label on the
        leftmost panel and, for the density row, a shared y-limit so the
        panels are directly comparable.

    Reads the module-level ``colors``, ``analyzeAll`` and ``theHighAndLow``.
    """
    for t, r in enumerate(data):
        q = axes[t]
        blocks = 0.1   # x position of the single stacked bar
        width = 0.6
        bottom = 0
        z = r.columns
        for i, u in enumerate(z):
            q.bar(blocks, r[u], width=width, bottom=bottom, color=colors[i], label=u)
            bottom += r[u]
        title = r.index.values[0]   # the frame's single row label = location name
        q.set_title(title, size=11, loc='left')
        # Booleans (not the deprecated 'off' strings, removed in modern
        # matplotlib) hide the x ticks and their labels.
        q.tick_params(axis='x', labelbottom=False, bottom=False, top=False)
        q.xaxis.labelpad = 10
        # Report the number of surveys behind this panel.
        nSamples = len(analyzeAll.loc[title]['Total'])
        noSample = 'samples' if nSamples > 1 else 'sample'
        q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
        if t == 0:
            # Only the leftmost panel of each row carries the y-label.
            if listName == 'topTenList':
                q.set_ylabel('Top-ten items as % of total', size=12)
            elif listName == 'densityList':
                q.set_ylabel('Pieces of trash per meter', size=12)
        if listName == 'densityList':
            # Shared scale across all density panels.  'top=' replaces the
            # deprecated 'ymax=' keyword; this also collapses the duplicate
            # set_ylim the original issued when t == 0.
            q.set_ylim(top=max(theHighAndLow))
        handles, labels = q.get_legend_handles_labels()
        q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.4, 1), borderaxespad=0.2, fancybox=False, frameon=False)
# Draw the two chart rows: top-ten shares (ax1-ax4) and densities (ax5-ax8).
charter(theTopTenList, topTenRow, 'topTenList')
charter(theDensityList, densityRow, 'densityList' )
plt.suptitle("Top ten items as % of total, density by category and distribution of results", size=14)
# --- Prepare per-survey log densities for the distribution panel ----------
theField = analyzeAll.copy()
theField = theField[theField.Total.notnull()]
# NOTE(review): 375 looks like an upper bound on plausible survey length --
# confirm the unit and the cutoff.
theField = theField[theField.length < 375]
# Pieces of trash per unit length, then its natural log.
theField['Dense'] = theField['Total']/theField['length']
theField['Logs'] = theField['Dense'].apply(np.log)
# Drop extreme low-density outliers (ln density <= -4.6, i.e. ~0.01/unit).
theField = theField[theField.Logs > -4.6]
# Fit a normal to the log densities (i.e. densities modeled as log-normal).
muField, sigmaField = stats.norm.fit(theField['Logs'])
theLogs = theField['Logs']
#px1 = thefield[cityChoices[0]]['Logs'].mean()
# NOTE(review): theZscore is computed from the same sample the fit used, so
# it is ~0 by construction; it and the quantities below are not read again
# in this excerpt -- presumably kept for interactive inspection.
theZscore = (theField['Logs'].mean() - muField)/sigmaField
sixteenth = muField + -1*sigmaField
eightyFourth = muField + 1*sigmaField
ratio = muField/sigmaField
zLessThanNegOne = sixteenth
zLessThanZero = muField
zLessThanOne = eightyFourth
# Mean log density per location: the x positions of the red markers below.
px=[]
for name in cityChoices:
thisPoint = theField.loc[name]['Logs'].mean()
px.append(thisPoint)
def logNormalCurve(theData, numberOfpoints, loc=None, scale=None):
    """Evaluate a fitted normal pdf over the range of ``theData``.

    Parameters
    ----------
    theData : iterable of floats; only its min/max are used to set the
        evaluation range.
    numberOfpoints : int, number of evenly spaced evaluation points.
    loc, scale : optional mean / standard deviation of the normal.  They
        default to the module-level fit (muField, sigmaField), so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    list ``[x, pdf, xmin, xmax]`` where ``x`` are the evaluation points and
    ``pdf`` the normal density at those points.
    """
    if loc is None:
        loc = muField
    if scale is None:
        scale = sigmaField
    xmin = min(theData)
    xmax = max(theData)
    x = np.linspace(xmin, xmax, numberOfpoints)
    pdf = stats.norm.pdf(x, loc=loc, scale=scale)
    return [x, pdf, xmin, xmax]
# --- Distribution panel (ax9) ---------------------------------------------
# Evaluate the fitted curve once and unpack all four results (the original
# called logNormalCurve four times with identical arguments).
x, pdf, xmin, xmax = logNormalCurve(theLogs, points)
ax9.plot(x, pdf, 'k', linestyle='dashed', color='b', alpha=1)
# Mean log density of each location and its height on the fitted pdf,
# computed once and reused for both the marker and the annotation
# (previously recomputed separately for the annotations).
px1 = px[0]
py1 = stats.norm.pdf(px1, loc=muField, scale=sigmaField)
px2 = px[1]
py2 = stats.norm.pdf(px2, loc=muField, scale=sigmaField)
px3 = px[2]
py3 = stats.norm.pdf(px3, loc=muField, scale=sigmaField)
# NOTE(review): the original also built px4/py4 from px[3]; they were never
# plotted or annotated and would raise IndexError with only three surveyed
# locations, so they are dropped here.
ax9.scatter(px1, py1, s=200, color = 'r')
ax9.scatter(px2, py2, s=200, color = 'r')
ax9.scatter(px3, py3, s=200, color = 'r')
ax9.spines['bottom'].set_position('zero')
ax9.set_title('The log normal distribution of litter densities')
ax9.set_xlabel('The ln(litter density), Cordova Creek', size=14)
# 'top=' replaces the deprecated 'ymax=' keyword.
ax9.set_ylim(top=1)
# NOTE(review): `im` is read but never displayed (the matching imshow call
# is commented out elsewhere in the notebook); kept so side effects are
# unchanged.
im = image.imread(imageFile)
# Closing text box in the lower-right panel (ax10).
closingText = 'This report printed on ' + str(today) + '\nCreated using python 3.62\nMath, trash and water\nwww.hammerdirt.ch'
ax10.text(0, 0, closingText, size=12)
ax10.set_axis_off()
ax9.annotate('Arcade Creek',
             xy=(px1, py1), xycoords='data',
             xytext=(0, 20), textcoords='offset points',
             fontsize = 12, horizontalalignment='center', verticalalignment='bottom')
ax9.annotate('Cordova Creek II',
             xy=(px2, py2), xycoords='data',
             xytext=(0, 20), textcoords='offset points',
             fontsize = 12, horizontalalignment='center', verticalalignment='bottom')
ax9.annotate('Antelope',
             xy=(px3, py3), xycoords='data',
             xytext=(0, 20), textcoords='offset points',
             fontsize = 12, horizontalalignment='center', verticalalignment='bottom')
# Save the figure to <folder>ataglance.pdf.
# NOTE(review): `folder` is a list, so str(folder) embeds brackets/quotes
# in the file name -- confirm the intended path.
saveDocString = str(folder) + 'ataglance.pdf'
pp = PdfPages(saveDocString)
Fig.savefig(pp, format='pdf')
pp.close()
plt.show()
plt.close()
In [ ]: