In [1]:
import os
import sys
import csv
import random
import re
from datetime import date, time, datetime

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.image as image
from matplotlib.gridspec import GridSpec

In [2]:
## these are some of the methods used to clean the data prior to loading it in the scripts for analysis
## You can do this however you like, but the goal is to have a matrix/dataframe/table
## that has location information (region, city), beach name, length (or area), then the litter data

# looking for and identifying the folder that contains the slr data
thisDir = os.getcwd()
dirList = os.listdir(thisDir)
folderList = [x for x in dirList if not os.path.isfile(os.path.join(thisDir, x))]
t = re.compile('cedre')#<------ search parameters
theFolders = [x for x in folderList if t.search(x) is not None]#<---- this makes the list of matching folders in the cwd
if len(theFolders) > 1:
    print(theFolders)#<--- if there is more than one match give me the list
else:
    myFolder = os.path.join(thisDir, theFolders[0])
    getIt = os.listdir(myFolder)
    print(getIt)#<--------- if there is just one match make a path


['cedreDict.csv', 'Bilan_MCMO_230817.csv.csv', 'BMCMO.csv', 'oneMore.csv', 'userFile.csv']
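
In [ ]:
# the same folder search can be written with pathlib; a sketch, equivalent to the loop above:
from pathlib import Path
candidates = [p for p in Path.cwd().iterdir() if p.is_dir() and 'cedre' in p.name]
candidates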

In [3]:
## Construct a path to your csv
## Then call .read_csv

thisCSV = os.path.join(myFolder, getIt[-3])#<----- getIt[-3] picks the source csv out of the folder listing
df = pd.read_csv(thisCSV, index_col=False)

In [4]:
# call df.columns and take a look
# Sometimes there are columns created in translation or output that are not part of the data
# There are two ways to handle this: 1 - drop what you don't need, or 2 - extract what you do need
# (both are sketched after the output below)
# But first get rid of all columns that are empty or have only 'Na' or 'NaN' values
df.columns


Out[4]:
Index(['RefNo', 'Beach name', 'Country', 'Region', 'Survey date', 'Period',
       'Plastic: Yokes [1]', 'Plastic: Bags [2]', 'Plastic: Small_bags [3]',
       'Plastic: Bag_ends [112]',
       ...
       'Survey: Old_rope_small [200]', 'Survey: Old_rope_large [201]',
       'Survey: Old_plastic_pieces [202]', 'Survey: Old_gloves [203]',
       'Survey: Old_cartons [204]', 'Survey: Old_oildrums_new [205]',
       'Survey: Old_oildrums_old [206]', 'Survey: Old_human_faeces [207]',
       'Survey: Old_animal_faeces [208]', 'Survey: Old_cloth_rope [210]'],
      dtype='object', length=132)
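
In [ ]:
# the two approaches named above, sketched; neither is applied here, cell In [5] does the actual drop
# 1 - drop what you don't need
dropped = df.drop(['RefNo', 'Country'], axis=1)
# 2 - extract what you do need
extracted = df[['Beach name', 'Region', 'Survey date']]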

In [5]:
# in this case there are at least eleven columns that have nothing to do with survey data
# the script does its own material calculations based on the user criteria file
# the first one is 'Unnamed', the result of not turning off the index label when the file is written to csv

colDrop = [df.columns[2]]
colDrop.append(df.columns[0])
df.drop(colDrop, axis=1, inplace=True)
df.columns


Out[5]:
Index(['Beach name', 'Region', 'Survey date', 'Period', 'Plastic: Yokes [1]',
       'Plastic: Bags [2]', 'Plastic: Small_bags [3]',
       'Plastic: Bag_ends [112]', 'Plastic: Drinks [4]',
       'Plastic: Cleaner [5]',
       ...
       'Survey: Old_rope_small [200]', 'Survey: Old_rope_large [201]',
       'Survey: Old_plastic_pieces [202]', 'Survey: Old_gloves [203]',
       'Survey: Old_cartons [204]', 'Survey: Old_oildrums_new [205]',
       'Survey: Old_oildrums_old [206]', 'Survey: Old_human_faeces [207]',
       'Survey: Old_animal_faeces [208]', 'Survey: Old_cloth_rope [210]'],
      dtype='object', length=130)

In [6]:
df.index


Out[6]:
RangeIndex(start=0, stop=170, step=1)

In [7]:
# call df.columns and see what you have
# so by process of elimination we have settled on option 1
# we have gotten rid of almost all the columns that are not needed
# there is one left: 'Total'; it is not calculated by the script,
# so there is no way to know how it was produced (a quick check follows in the next cell)
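
In [ ]:
# a quick check of the leftover 'Total' column, sketched; if the recomputed sum does not
# match the provided one, the column cannot be trusted
itemCols = [c for c in df.columns[4:] if c != 'Total']
recomputed = df[itemCols].sum(axis=1)
(recomputed == df['Total']).all()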

In [8]:
# without visualizing this it is still hard to 'tell' what is in the output.
# one thing is to eliminate all na values from the matrix/dataframe
df.fillna(0, inplace=True)
# all dataframe values are now numeric!
# Drop the existing total column (there is no way to know how it was really calculated)
df.drop('Total', axis=1, inplace=True, errors='ignore')
# create a new Total column
itemTotal = df.columns[4:]
df['Total'] = df[itemTotal].sum(axis=1)
# with a verified total, get rid of any rows with a Total less than 1
df = df[df.Total > 0]
lessThan = []
for p in list(df.columns[4:]):
    b = df[p].sum()
    if b == 0:
        lessThan.append(p)
df.drop(lessThan, axis=1, inplace=True)
# at this point analysis is possible on all litter values
# export this to csv and save for later
df.to_csv('cedreClean.csv', index_label=False)
df.drop('Total', axis=1, inplace=True)#<----- this needs to go for now, but we will put it back in later

In [9]:
df.columns


Out[9]:
Index(['Beach name', 'Region', 'Survey date', 'Period', 'Plastic: Yokes [1]',
       'Plastic: Bags [2]', 'Plastic: Small_bags [3]',
       'Plastic: Bag_ends [112]', 'Plastic: Drinks [4]',
       'Plastic: Cleaner [5]',
       ...
       'San: Toilet [101]', 'San: Other [102]', 'Med: Containers [103]',
       'Med: Syringes [104]', 'Med: Other [105]', 'Faeces: In_bags [121]',
       'Pollutants: Wax_small [108]', 'Pollutants: Wax_medium [109]',
       'Pollutants: Wax_large [110]', 'Pollutants: Other [111]'],
      dtype='object', length=117)

In [10]:
# the output is destined for a large audience, so being able to identify where these locations are is important
# take a look at the beach names and see if all of that makes sense
# each row represents a survey, so the same name will appear multiple times
# get rid of the duplicates
names = []
for x in list(df['Beach name']):
    if x not in names:
        names.append(x)
names
# this is where things get sticky, the GPS coordinates need to be included
# so the output can be used for mapping and everything else
# get the gps from data provided by the users
# create a dict that relates each name provided to its gps before changing anything else
# (a sketch of that dict follows the output below)


Out[10]:
['Blancs Sablons',
 'Kerizella',
 'Koubou',
 'La Grandville',
 'Larmor Plougastel',
 'Le Valais',
 'Porsmilin',
 'Sein',
 'Trielen']
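
In [ ]:
# a sketch of the gps dict described above, assuming a user-supplied lookup file
# 'cedre/gps.csv' (hypothetical) with columns 'Beach name', 'lat', 'lon':
gpsDf = pd.read_csv('cedre/gps.csv')
gpsDict = gpsDf.set_index('Beach name').T.to_dict('list')
gpsDict['Blancs Sablons']#<----- would give [lat, lon] for mapping later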

In [12]:
choices = names #<----------- this is the list of locations that can be chosen later
# Choose a location to graph
for i, x in enumerate(choices):
    print(str(i) + ' - ' + str(x))
userChoice = int(input('\nenter the index number of the city of your choice : '))


0 - Blancs Sablons
1 - Kerizella
2 - Koubou
3 - La Grandville
4 - Larmor Plougastel
5 - Le Valais
6 - Porsmilin
7 - Sein
8 - Trielen

enter the index number of the city of your choice : 0

In [13]:
# the user preferences are ready to go
# now we need to enter the criteria from the user
# this allows the report data to be in any language
# the default model is in english; however, if somebody makes a criteria file
# in another language we will make that available too
# the reason the German criteria file works for this data set is because it was generated from this set,
# therefore all the codes in the user's data are in the criteria file
# furthermore, report narratives can be adjusted:
# summary table text is entered as variables, as well as chart titles,
# and the user has the option to upload those as well
# (a sketch of the expected criteria file layout is in the next cell)
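
In [ ]:
# a minimal sketch of what a criteria file holds, one column per litter code,
# one row per attribute (the values here are hypothetical):
exampleCriteria = pd.DataFrame(
    {'Plastic: Bags [2]': ['Plastic', 'Shopping bags', 'Household'],
     'Plastic: Drinks [4]': ['Plastic', 'Drink bottles', 'Household']},
    index=['Material', 'Description', 'Source'])
exampleCriteria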

In [14]:
# upload the criteria file 
criteria = os.path.join(myFolder, 'cedreDict.csv')

In [15]:
# run it through the function
# the output is a dataFrame with mlw codes as column headers and the criteria as rows
def userCriteria(csvFile):
    s1 = df.iloc[0:1, 4:]
    s2 = s1.to_dict('list')
    d1 = pd.read_csv(csvFile)
    cols = d1.columns
    if 'Unnamed: 0' in cols:
        d1.drop(['Unnamed: 0'], axis=1, inplace=True)
    #d1.set_index(['Type'], inplace=True)
    d2 = d1.to_dict('list')
    thisList = {}#<--------- an empty dict to store the matched pairs in
    # iterate through all the G codes in the user data
    for x in s2.keys(): #<------ Just looking at the dict keys
    # compare those to the G codes that are in the new criteria
        for y in d2.keys():
            # every G code provided by the user
            # is compared to the G codes in the criteria list
            # so if a code in s2 is in d2 the following happens
            if x == y:
                # x = 'Some Gcode' from the list such as 'G23'
                # if a match is found
                # the G code is matched to the new criteria
                q = d2[x] #<----------- This picks up the value from the key, value pair in d2
                # stored in a dict
                new = {x: q} #<------- the Gcode from s2 (aka x) picks up the value from the corresponding code in d2
                # that dict is added to thisList
                thisList.update(new)
    ds = pd.DataFrame(thisList)
    #saveString = str(folder) + 'your_Criteria.csv'
    #ds.to_csv(saveString)
    #print('\nThe criteria has been matched to your data, the file is stored under ' +  str(saveString))
    return ds

ds = userCriteria(criteria)

ds = userCriteria(criteria)
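
In [ ]:
# the nested loops in userCriteria amount to a key intersection; the same logic on toy dicts:
userCodes = {'G1': [0], 'G2': [0]}#<----- hypothetical codes from the survey data
criteriaCodes = {'G2': ['Plastic'], 'G3': ['Metal']}#<----- hypothetical criteria
{code: criteriaCodes[code] for code in userCodes if code in criteriaCodes}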

In [16]:
# the data is ready to go and can be sorted using the criteria provided
# however the user has not identified what the data in each row of the criteria is
# there is a function for that:


def sortIt():
    print('''\nThe contents of the criteria file need to be named.
    Please provide a one word description for the contents of each row.
    The minimum size criteria file is three rows plus the column headers
    -one row for your specific criteria
    -one row for a description of the MLW code
    -one row for the material\n''')

    print('''If the order and names of the rows in your criteria file are:
    \nRow one : Material
    \nRow two : Description
    \nRow three : Source
    \nOr if you are using the criteria file provided you can skip the next three questions''')
    skip = input('Skip the next three questions?(y/n) ')
    if skip == 'y':
        firstRow = 'Material'
        secondRow = 'Description'
        thirdRow = 'Source'
        ds.insert(0, 'Code', [firstRow, secondRow, thirdRow])#<----- label the rows so ds.loc[] works below
        ds.set_index('Code', inplace=True, drop=True)
    else:
        firstRow = input('Please provide a one word description for the contents of the first row: \n')
        #secondRow = input('\nTitle for the second row: ')
        #thirdRow = input('\nTitle for the third row: ')
        ds.insert(0, 'Code', [firstRow])#, secondRow, thirdRow])
        ds.set_index('Code', inplace=True, drop=True)
    mySort = input('\nWhich one would you like to use to group your data? ')
    #myMat = input('\nWhich one is the material type? ')
    dSortx = ds.loc[mySort]
    #dMatx = ds.loc[myMat]
    dsort = dSortx.to_dict()#<--------- this will be used to group by source
    #dmat = dMatx.to_dict()#<--------- this would group by material
    return [dsort]

dsort = sortIt()

print('''At the end of this script you can do other analysis on the formatted data
by calling 'userFile', it is a pandas DataFrame with the index set to City, Location, Date''')

# use the dict to create a groupby instance (the mechanics are sketched after this cell's output)
# for each row in the criteria file you can create a groupby instance
# however row two is description, so that wouldn't be very helpful

def makeDfs():
    df.set_index(['Beach name', 'Survey date', 'Period'], inplace=True)
    userFile = df.copy()
    source = df.groupby(dsort, axis=1)#<----- groupby instance; dsort is the [mapping] returned by sortIt
    #mat = df.groupby(dmat, axis=1)#<----- groupby instance
    # here is the data that will be charted
    oneMore = pd.DataFrame(source.sum())
    #oneMat = pd.DataFrame(mat.sum())
    #oneMat['Total'] = oneMat.iloc[:, 1:].sum(axis=1)
    oneMore['Total'] = oneMore.iloc[:, 0:].sum(axis=1)
    userFile['Total'] = df.iloc[:, 1:].sum(axis=1)
    return [oneMore, userFile]


oneMore, userFile = makeDfs()

# export the different data frames as csv
# if somebody wants to verify the work or build their own analysis they can

def makeCsvs(x, y):
    for n, data in enumerate(x):
        name = y[n]
        saveString = 'cedre/' + name + '.csv'
        data.to_csv(saveString)

makeCsvs([oneMore, userFile], ['oneMore', 'userFile'])


The contents of the criteria file need to be named.
    Please provide a one word description for the contents of each row.
    The minimum size criteria file is three rows plus the column headers
    -one row for your specific criteria
    -one row for a description of the MLW code
    -one row for the material

If the order and names of the rows in your criteria file are:
    
Row one : Material
    
Row two : Description
    
Row three : Source
    
Or if you are using the criteria file provided you can skip the next three questions
Skip the next three questions?(y/n) n
Please provide a one word description for the contents of the first row: 
Material

Which one would you like to use to group your data? Material
At the end of this script you can do other analysis on the formatted data
by calling 'userFile', it is a pandas DataFrame with the index set to City, Location, Date
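
In [ ]:
# makeDfs groups columns with a dict: each column label is looked up in the mapping and
# columns that share a value are summed; the mechanics on a toy frame:
toy = pd.DataFrame({'G1': [1, 2], 'G2': [3, 4], 'G3': [5, 6]})
toyMap = {'G1': 'Plastic', 'G2': 'Plastic', 'G3': 'Metal'}
toy.groupby(toyMap, axis=1).sum()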

In [114]:
q = userFile.columns[1:-1]
p = q.str.extract(r':(\D+)\[', expand=False)#<----- the short name between ':' and '['
abbrevName = []
#p.str.extract('(\D+)\\[')
for n, k in zip(q, p):
    namedict = {n: k}
    abbrevName.append(namedict)
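
In [ ]:
# abbrevName is a list of one-entry dicts; a single lookup dict would carry the same
# information, a sketch using the q and p from the cell above:
abbrev = dict(zip(q, p))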

In [179]:
#### this section turns the top ten items (plus the other objects)
#### into a percentage of the whole, by dividing by the Total column
## the data is used for ax1
# output is a DataFrame with 1 row and 11 columns
# row name is the location

def topTen(file):
    k = list(file.columns)
    l = Series(file[k[1:-1]].sum(), name='Lake')#<----- the Series name is a leftover label, not used downstream
    l = l.sort_values(ascending=False)
    tenx = l.index.values[:10]
    elevenX = l.index.values[10:]
    checkThis = pd.DataFrame(file[tenx])
    theStuff = Series(file[elevenX].sum(axis=1), name='All other objects')
    checkThis = pd.concat([checkThis, theStuff], axis=1)
    this = pd.DataFrame(file['Total'])
    thisRightHere = pd.concat([checkThis, this], axis=1)
    return thisRightHere

def the2ColumnDivider(thedataframe, theDenominator):
    cols = list(thedataframe.columns[0:-1])
    anotherDF = pd.DataFrame()
    for x in cols:
        op = thedataframe[x]
        op2 = thedataframe[theDenominator]
        opRatio = (op/op2)*100
        thisRatio = Series(opRatio, name=x)
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    return anotherDF

def theDenseDivider(theDataFrame, theSorter, percent):
    cols = list(theDataFrame.columns[1:])
    anotherDF = pd.DataFrame()
    for x in cols:
        op = theDataFrame[x]
        op2 = 100
        if percent == 'y':
            opRatio = (op/op2)*100
        else:
            opRatio = op/op2
        thisRatio = Series(opRatio, name=x)
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    densityGroup = anotherDF.groupby(theSorter, axis=1)
    sourceDensity = pd.DataFrame(densityGroup.sum())
    return sourceDensity

# noSortDenseDivider keeps the four-argument signature but does the same work as theDenseDivider;
# theDenominator is accepted but never used
def noSortDenseDivider(theDataFrame, theDenominator, theSorter, percent):
    return theDenseDivider(theDataFrame, theSorter, percent)

def topTenLength(file):#<----- expects a 'length' column in userFile; not used with this data set
    k = list(file.columns)
    l = Series(file[k[1:-1]].sum(), name='Lake')
    l = l.sort_values(ascending=False)
    tenx = l.index.values[:10]
    elevenX = l.index.values[10:]
    checkThis = pd.DataFrame(file[tenx])
    theStuff = Series(file[elevenX].sum(axis=1), name='Other')
    checkThis = pd.concat([checkThis, theStuff], axis=1)
    this = pd.DataFrame(userFile['length'])
    thisRightHere = pd.concat([checkThis, this], axis=1)
    return thisRightHere

### this function divides the columns by either the total or the length
## you can select either percent 'y' (divide by total)
## or 'n' (divide by length)
# output is a dataFrame

def thetopTenDivider(theDataFrame, theDenominator, percent):
    cols = list(theDataFrame.columns[:-1])
    anotherDF = pd.DataFrame()
    for x in cols:
        op = theDataFrame[x]
        op2 = theDataFrame[theDenominator]
        if percent == 'y':
            opRatio = (op/op2)*100
        else:
            opRatio = op/op2
        thisRatio = Series(opRatio, name=x)
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    return anotherDF

def thetopTenDense(theDataFrame, theDenominator):
    cols = list(theDataFrame.columns[0:-1])
    anotherDF = pd.DataFrame()
    for x in cols:
        opRatio = theDataFrame[x]/theDenominator
        thisRatio = Series(opRatio, name=x)
        anotherDF = pd.concat((anotherDF, thisRatio), axis=1)
    return anotherDF

def otherTopTen(theReferenceData, theData):
    thisUnDf = pd.DataFrame()
    for gCode in list(theReferenceData.index[:-1]):
        thisUnItem = Series(theData.loc[gCode], name=gCode)
        thisUnDf = pd.concat((thisUnDf, thisUnItem), axis=1)
    return thisUnDf

def dropColAndSum(dataToDropFrom, otherTopTenColsToDrop, lengthOrTotalcolumn, nameOtherObjects):
    dropMe = dataToDropFrom.drop(list(otherTopTenColsToDrop.columns))
    dropMe = Series(dropMe.drop(lengthOrTotalcolumn).sum(), name=nameOtherObjects)
    theOtherTopTen = pd.concat((otherTopTenColsToDrop, dropMe), axis=1)#<----- was a stray reference to a global 'thisUnDf'
    return theOtherTopTen

# top-ten-% of total
# use the userFile
# accepts either a slice or the entire data frame
def topTenInput(dataFrame):
    x = pd.DataFrame(Series(topTen(dataFrame).sum()))
    xtrans = x.transpose()
    ax1Data = the2ColumnDivider(xtrans, 'Total')
    ax1Data = ax1Data.iloc[0].sort_values(ascending=False)
    return ax1Data
    
#material as % of total
# use the userFile
# accepts either a slice or the entire data frame
# note: this needs the material mapping dmat, which is commented out in sortIt above;
# the call matches the signature of noSortDenseDivider
def matPerInput(dataFrame):
    q = noSortDenseDivider(dataFrame, 'Total', dmat, 'y')
    ax2Input = pd.DataFrame(Series(q.mean(), name='Materials')).transpose()#<----- 'Materials' replaces an undefined variable
    ax2Input = ax2Input.iloc[0].sort_values(ascending=False)
    return ax2Input

# Source density input
# use the userFile
# accepts either a slice or the entire data frame
def sourceDenInput(dataFrame):
    q = theDenseDivider(dataFrame, dsort,'n')
    ax3Input = pd.DataFrame(Series(q.mean())).transpose()
    ax3Input = ax3Input.iloc[0].sort_values(ascending=False)
    return ax3Input
def topTenDensity(dataFrame):
    q = pd.DataFrame(Series(topTen(dataFrame).mean()))
    transq = q.transpose()
    topTenx = thetopTenDense(transq, 100)
    topTenx = topTenx.iloc[0].sort_values(ascending=False)
    return topTenx
    

### this section gets the info for the summary data
# output is a string for ax5
# use the userFile
# accepts either a slice or the entire data frame
def summaryString(dataFrame):
    nSamples = len(list(dataFrame.index.get_level_values(0)))#<----- number of samples
    averageDensity = sourceDenInput(dataFrame).sum().round(2)#<------ average density summed over the groups
    totalNumber = dataFrame['Total'].sum()#<------------ total number of pieces
    dates = pd.to_datetime(dataFrame.index.get_level_values('Survey date'))#<------- the dates from the index, as datetimes
    theEarliest = min(dates).date()#<--------- the earliest date
    theLatest = max(dates).date()#<------------- the latest date
    logOfAverage = np.log(averageDensity)
    #summaryData = userChoice
    summaryData2 = ('Total number of pieces: ' + str(totalNumber) +
                    '\nAverage density: ' + str(averageDensity) + ' pcs/m' +
                    '\nFirst sample: ' + str(theEarliest) +
                    '\nLast sample : ' + str(theLatest) +
                    '\nNumber of samples: ' + str(nSamples) +
                    '\n')
    return [summaryData2, nSamples, logOfAverage]

def logsDense(dataFrame):
    z =  dataFrame.copy()
    z['Dense'] = z['Total']/100#<----- density, using the same fixed 100m denominator as above
    z['Logs'] = z['Dense'].apply(np.log)
    mu, sigma = stats.norm.fit(z['Logs'])
    points = len(z['Logs'])
    theLogs = z['Logs']
    return [mu, sigma, points, theLogs]
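
In [ ]:
# logsDense fits a normal distribution to the log of the densities; the idea, checked on
# synthetic data (the parameters here are made up):
fake = np.random.lognormal(mean=1.0, sigma=0.5, size=500)
mu, sigma = stats.norm.fit(np.log(fake))
print(mu, sigma)#<----- should come back near 1.0 and 0.5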

In [191]:
colors = ['#003333','#339966', '#66CCCC',
          '#3399CC', '#993333', '#CC6600', '#FF0033',
          '#FFCC00', '#3366CC', '#66CC00', '#333300',
          '#FF0000', '#ec00ff', '#00ff32']#<----- note: this shadows the matplotlib.colors import, which is not used again

def siteReport(dataFrame, dataFrameAll):
    
    one = topTenInput(dataFrame)
    two = sourceDenInput(dataFrame)
    three = topTenDensity(dataFrame)
    summaryData2, nSamples, logOfAverage = summaryString(dataFrame)
    summaryData = summaryTitle#<----- the site name, set as a global before each call
    mu, sigma, points, logs= logsDense(dataFrame)
    theMu, theSigma, thePoints, theLogs = logsDense(dataFrameAll)

    fig = plt.figure(figsize=(8.27, 11.69), frameon=False, edgecolor='#000000', linewidth=1)


    rect0 = .65, .71, .1, .24
    rect1 = .65, .36, .1, .24
    rect2 = .13, .36, .1, .24
    rect3 = .23, .7, .06, .06 
    rect4 = .08, .9, .4, .05
    rect5 = .08, .82, .4, .09
    rect6 = .13, .07, .4, .195
    rect7 = .6, .12, .3, .09
    rect8 = .62, .16, .2, .1
    rect9 = .08, .66, .3, .02

    ax1 = fig.add_axes(rect0)#<-----------x2TopTenSummary
    ax2 = fig.add_axes(rect1)#<-----------matDensitySummaryLocation
    ax3 = fig.add_axes(rect2)#<-----------sourceDensitySummaryLocation
    ax4 = fig.add_axes(rect3)#<-----------performance
    ax5 = fig.add_axes(rect4)#<-----------summaryDataTitle
    ax8 = fig.add_axes(rect7)#<-----------summaryData
    ax6 = fig.add_axes(rect5)#<-----------summaryData2
    ax7 = fig.add_axes(rect6)#<----------- comparison chart
    ax9 = fig.add_axes(rect8)#<----------- image

   


    theAxes = [ax1, ax2, ax3]#<----- the three summary bar charts
    data = [one, two, three]
    unitsLabel = 'meter'

    for t, r in enumerate(data):
        blocks = 0.1
        width = 0.6
        bottom = 0
        z = r.index
        q = theAxes[t]
        barLabels = []
        for c in z:
            for n in abbrevName:
                for key, value in n.items():
                    if c == key:
                        barLabels.append(value)
        if len(barLabels) > 0:
            barLabels.append('Other objects')
        else:
            barLabels = z
        for i, u in enumerate(z):
            color = colors[i]
            q.bar(blocks, r[u], width=width, bottom=bottom, color=color, label=barLabels[i])
            bottom += r[u]
        handles, labels = q.get_legend_handles_labels()
        q.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.05, 1), borderaxespad=0., fancybox=False, frameon=False)
        if t == 0:
            title = 'Top-ten items found'
            ylabel = 'Top-ten objects as a % of total'
            q.set_ylim(top=102)
        elif t == 1:
            title = 'Materials'
            ylabel = 'Material pieces per meter'
        elif t == 2:
            title = 'Top-ten items per meter'
            ylabel = 'Pieces of trash per ' + unitsLabel
        q.set_title(title, size=14, loc='left')
        q.set_ylabel(ylabel, size=12)
        q.tick_params(axis='x', labelbottom=False, bottom=False, top=False)
        noSample = 'samples' if nSamples > 1 else 'sample'
        q.set_xlabel('n= ' + str(nSamples) + ' ' + noSample, size=12)
        q.xaxis.labelpad = 20
                 


    percen = stats.percentileofscore(theLogs, logOfAverage).round(0)
    if percen <= 16:
        color = 'g'
    elif percen < 50:
        color = 'y'
    elif percen < 84:
        color = '#FF4500'
    else:
        color = 'r'
    zText = 'Density is greater than '

    theRest = '%\n of all other locations'
    ax4.scatter(1, 1, s=500, color = color)
    ax4.set_title('Overall performance')
    ax4.set_xlabel(zText + str(percen) + theRest)
    ax4.tick_params(axis='both', labelbottom=False, bottom=False, top=False, left=False)
    ax4.set_yticklabels([])
    ax4.set_xticklabels([])

    ax5.text(0, 1, summaryData , size=16, wrap=True, verticalalignment='top')
    ax6.text(0, 1, summaryData2, size=14, verticalalignment='top')

    ax5.set_axis_off()
    ax6.set_axis_off()

    # reference points on the fitted distribution of all locations
    theZscore = (theLogs.mean() - theMu)/theSigma
    sixteenth = theMu - theSigma#<----- ~16th percentile of the log densities
    eightyFourth = theMu + theSigma#<----- ~84th percentile


    def logNormalCurve(theData, numberOfpoints):
        xmin = min(theData)
        xmax = max(theData)
        x = np.linspace(xmin, xmax, numberOfpoints)
        pdf = stats.norm.pdf(x, loc=theMu, scale=theSigma)
        return [x, pdf, xmin, xmax]

    x, pdf, xmin, xmax = logNormalCurve(theLogs, thePoints)
    px = logOfAverage
    #z['Dense'] = z['Total']/100
    py = stats.norm.pdf(px, loc=theMu, scale=theSigma)


    ax7.plot(x, pdf, color='b', linestyle='dashed', alpha=1)
    ax7.fill_between(x, stats.norm.pdf(x, loc=theMu, scale=theSigma), where=(x <= px), color='g', alpha=0.3)
    ax7.vlines(px, 0, stats.norm.pdf(theMu, loc=theMu, scale=theSigma), color='b', linestyle='-.')
    ax7.scatter(px, py, s=200, color='r')
    ax7.spines['bottom'].set_position('zero')
    ax7.set_title('The distribution of all values and this location')
    ax7.set_xlabel('The ln of litter density', size=14)

    today = date.today()
    closingText = ('This report printed on ' + str(today) +
                   '\nCreated using Python 3.6.2\nMath, trash and water\nwww.hammerdirt.ch')
    ax8.text(0, 0, closingText , size=12, wrap=True, verticalalignment='top')
    ax8.set_axis_off()
    imageFile = 'weblogo.jpg'#<----- assumes the logo file is in the working directory
    im = image.imread(imageFile)
    ax9.imshow(im, aspect='auto', zorder=-1)
    ax9.set_axis_off()

    from matplotlib.backends.backend_pdf import PdfPages

    plt.savefig(summaryData, bbox_inches='tight')#<----- a png snapshot named after the site
    saveDocString = str(summaryData) + '_litter_Density.pdf'
    pp = PdfPages(saveDocString)

    fig.savefig(pp, format='pdf')#<----- a figure can be written directly to an open PdfPages

    pp.close()

    plt.show()
    plt.close()

In [192]:
for x in choices:
    summaryTitle = str(x[:25])#<----- siteReport reads this as a global
    siteReport(userFile.loc[x], userFile)
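
In [ ]:
# siteReport rates a location with stats.percentileofscore: the share of all observed
# log densities at or below this site's average; the mechanics on toy numbers:
stats.percentileofscore([1, 2, 3, 4, 5], 3.5)#<----- 60.0, three of the five values are below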



In [ ]: