This is for Task 3. (Descriptive section)


In [43]:
def _numeric_summary(values, label):
    """Return the one-sentence descriptive summary for a numeric column.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to summarise.
    label : str
        Lower-case column name used in the sentence (e.g. ``"latitude"``).

    Returns
    -------
    str
        Sentence reporting min, max, count, mean, standard deviation and
        the three quartiles, each float formatted with ``%f``.
    """
    # Call describe() once and read the statistics by label. Integer keys
    # such as stats[3] rely on a positional fallback that recent pandas
    # versions deprecate for label-indexed Series.
    stats = values.describe()
    # Adjacent string literals are used instead of backslash continuations
    # so that source indentation does not leak into the printed sentence.
    return (
        "%s description: The %s data ranges from %f to %f, "
        "with %d numbers in all, the mean value is %f, "
        "the standard deviation is %f. The 1st quartile value is %f, "
        "the median value is %f, the 3rd quartile value is %f."
        % (
            label.capitalize(), label,
            stats["min"], stats["max"], int(stats["count"]),
            stats["mean"], stats["std"],
            stats["25%"], stats["50%"], stats["75%"],
        )
    )


def describe(data):
    """Print a descriptive report of a CSV file and plot its category counts.

    The CSV must provide the columns ``latitude``, ``longitude`` and
    ``categorical``. The function prints one summary sentence for each of
    the two numeric columns, one sentence about the categorical column, and
    then shows a bar chart and a pie chart of the category frequencies
    (most frequent category first).

    Parameters
    ----------
    data : str or file-like
        Anything accepted by ``pandas.read_csv`` (test sample:
        ``'t3sample.csv'``).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the required columns is missing from the CSV.
    """
    from collections import Counter

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    df = pd.read_csv(data)
    if df.empty:
        # Without rows there are no statistics to report and no "most common"
        # category; say so instead of failing deep inside the formatting.
        print("No rows found in %s; nothing to describe." % (data,))
        return

    # Numeric columns: the same sentence template, with the correct column
    # name substituted for each one.
    print(_numeric_summary(df["latitude"], "latitude"))
    print(_numeric_summary(df["longitude"], "longitude"))

    # Categorical column: count once, ordered from most to least frequent.
    ranked = Counter(df["categorical"]).most_common()
    categories = [category for category, _ in ranked]
    occurrences = [count for _, count in ranked]
    print("There are %d types of categorical data, among all, the most common item is %s, with the occurrence of %d." % (
        len(ranked), categories[0], occurrences[0]))

    # Bar chart of the sorted category counts.
    positions = np.arange(len(categories))
    bar_fig, bar_ax = plt.subplots()
    bar_ax.bar(positions, occurrences, align='center', alpha=0.5)
    bar_ax.set_xticks(positions)
    bar_ax.set_xticklabels(categories)
    bar_ax.set_ylabel('number of grants')
    bar_ax.set_title('Program types vs their counts')
    plt.show()

    # Pie chart of each category's share of all rows, in percent.
    total_rows = len(df["categorical"])
    shares = [count / total_rows * 100 for count in occurrences]
    pie_fig, pie_ax = plt.subplots()
    pie_ax.pie(shares, labels=tuple(categories), autopct='%1.1f%%',
               shadow=True, startangle=90)
    pie_ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

In [44]:
# Run the report on the sample CSV: prints the three summary sentences and shows both charts.
describe('t3sample.csv')


Latitude description: The latitude data ranges from -89.995631 to 89.974703, with 10000 numbers in all,    the mean value is 1.026321, the standard deviation is 51.545570. The 1st quartile value is -43.319612,     the median value is 1.219217, the 3rd quartile value is 46.321107.
Longitude description: The latitude data ranges from -179.756945 to 179.946696, with 10000 numbers in all,    the mean value is -0.082380, the standard deviation is 103.546299. The 1st quartile value is -89.584439,     the median value is 0.070930, the 3rd quartile value is 88.911360.
There are 10 types of categorical data, among all, the most common item is Category04, with the occurrence of 1077.

In [ ]: