In [1]:
import pandas as pd

In [2]:
%matplotlib inline

In [3]:
# Load the list of passers (one row per examinee) and preview the first rows.
upcat_data = pd.read_csv("./passers.csv")
upcat_data.head(5)


Out[3]:
Name Campus Course
0 AALA, ANJILA BIANCA LAO MANILA BS INDUSTRIAL PHARMACY (5 YRS)
1 AALA, IRMA RUTH PALUBON   **Pending Case
2 AALA, JADE GABRIELLE MAGLUNOB VISAYAS DEGREE PROGRAM WITH AVAILABLE SLOT
3 ABABA, ETHEL MAY GRACE LACERNA VISAYAS BA PSYCHOLOGY
4 ABACA, HANZ CHRISTIAN CALILAP LOS BAÑOS BS COMPUTER SCIENCE

In [4]:
# Keep only the 'Course' column (as a one-column DataFrame) and peek at it.
course_data = upcat_data[['Course']]
course_data.head(10)


Out[4]:
Course
0 BS INDUSTRIAL PHARMACY (5 YRS)
1 **Pending Case
2 DEGREE PROGRAM WITH AVAILABLE SLOT
3 BA PSYCHOLOGY
4 BS COMPUTER SCIENCE
5 DEGREE PROGRAM WITH AVAILABLE SLOT
6 BA EUROPEAN LANGUAGESWaitlisted: BS COMPUTER S...
7 BS DEVELOPMENT COMMUNICATION
8 BS BUSINESS ECONOMICS
9 BA BROADCAST COMMUNICATION

In [5]:
# Cast the 'Course' column to pandas' categorical dtype; the result is a Series.
course_series = upcat_data['Course'].astype('category')

In [6]:
# Ten most frequent 'Course' values (value_counts sorts by descending count).
course_series.value_counts().head(10)


Out[6]:
DEGREE PROGRAM WITH AVAILABLE SLOT    3114
**Pending Case                        1538
BS BIOLOGY                             904
BS COMPUTER SCIENCE                    742
BS MATHEMATICS                         341
BS MANAGEMENT                          321
BA POLITICAL SCIENCE                   241
BS CHEMICAL ENGINEERING (5 YRS)        231
BA PSYCHOLOGY                          223
BS CIVIL ENGINEERING (5 YRS)           218
dtype: int64

In [7]:
# Positions 0 and 1 of the counts are the placeholder labels
# 'DEGREE PROGRAM WITH AVAILABLE SLOT' and '**Pending Case' rather than actual
# degree programs, so the chart starts at position 2 and shows ranks 3-10.
course_series.value_counts().iloc[2:10].plot(kind='bar')


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0xaf6f320>

In [8]:
# Full "FAMILY, GIVEN" name strings as a categorical Series; show the first ten.
names = upcat_data['Name'].astype('category')
names.head(10)


Out[8]:
0            AALA, ANJILA BIANCA LAO
1            AALA, IRMA RUTH PALUBON
2      AALA, JADE GABRIELLE MAGLUNOB
3     ABABA, ETHEL MAY GRACE LACERNA
4      ABACA, HANZ CHRISTIAN CALILAP
5      ABACAN, AUSTIN SYMON BULACLAC
6     ABACAN, JOHN CHRISTIAN BARITUA
7    ABAD, ANGELICA SARAH ESTANISLAO
8        ABAD, BIANCA DENISE MENDOZA
9       ABAD, DUSTIN PAOLO SALVANERA
Name: Name, dtype: category
Categories (14988, object): [AALA, ANJILA BIANCA LAO < AALA, IRMA RUTH PALUBON < AALA, JADE GABRIELLE MAGLUNOB < ABABA, ETHEL MAY GRACE LACERNA ... ZULUETA, PIA YSABEL BANAS < ZULUETA, SANDRA MAE CARAAN < ZUNIGA, CINDERELLA COJAS < ZUNIGA, DARYLL KENDRICK CABIGAO]

In [9]:
# Add an empty 'Familyname' column at position 0; it is filled in once the
# 'Name' strings are split. DataFrame.insert raises ValueError when the column
# already exists, so guard the call to keep this cell safe to re-run.
if 'Familyname' not in upcat_data.columns:
    upcat_data.insert(loc=0, column='Familyname', value='')
upcat_data[:5]


Out[9]:
Familyname Name Campus Course
0 AALA, ANJILA BIANCA LAO MANILA BS INDUSTRIAL PHARMACY (5 YRS)
1 AALA, IRMA RUTH PALUBON   **Pending Case
2 AALA, JADE GABRIELLE MAGLUNOB VISAYAS DEGREE PROGRAM WITH AVAILABLE SLOT
3 ABABA, ETHEL MAY GRACE LACERNA VISAYAS BA PSYCHOLOGY
4 ABACA, HANZ CHRISTIAN CALILAP LOS BAÑOS BS COMPUTER SCIENCE

In [10]:
def process_names(text):
    """Split a "FAMILYNAME, GIVEN NAMES" string into its two parts.

    Parameters
    ----------
    text : str
        Full name with the family name before the first comma.

    Returns
    -------
    pandas.Series
        Two elements: the family name and the remainder of the name, both
        with surrounding whitespace removed.

    Raises
    ------
    ValueError
        If ``text`` contains no comma.
    """
    # Split on the FIRST comma only, so a later comma (e.g. a ", JR." suffix)
    # stays in the given-name part instead of breaking the two-way split.
    familyname, separator, name = text.partition(',')
    if not separator:
        raise ValueError(
            "expected a name of the form 'FAMILYNAME, GIVEN NAMES', got %r" % (text,)
        )
    # Strip so the given name does not keep the space that follows the comma.
    return pd.Series([familyname.strip(), name.strip()])

In [11]:
# Run process_names on every 'Name' value and store the two resulting columns
# back into 'Familyname' and 'Name'.
# NOTE: this overwrites 'Name' with the post-split value, so the cell is meant
# to be run once per freshly loaded frame.
upcat_data[['Familyname', 'Name']] = upcat_data['Name'].apply(process_names)
upcat_data.head(5)


Out[11]:
Familyname Name Campus Course
0 AALA ANJILA BIANCA LAO MANILA BS INDUSTRIAL PHARMACY (5 YRS)
1 AALA IRMA RUTH PALUBON   **Pending Case
2 AALA JADE GABRIELLE MAGLUNOB VISAYAS DEGREE PROGRAM WITH AVAILABLE SLOT
3 ABABA ETHEL MAY GRACE LACERNA VISAYAS BA PSYCHOLOGY
4 ABACA HANZ CHRISTIAN CALILAP LOS BAÑOS BS COMPUTER SCIENCE

In [12]:
# Family names as a categorical Series; preview the first five.
familynames = upcat_data['Familyname'].astype('category')
familynames.head(5)


Out[12]:
0     AALA
1     AALA
2     AALA
3    ABABA
4    ABACA
Name: Familyname, dtype: category
Categories (8465, object): [AALA < ABABA < ABACA < ABACAN ... ZUBELDIA < ZULLA < ZULUETA < ZUNIGA]

In [13]:
# Number of rows per family name, most common first.
familynames.value_counts()


Out[13]:
REYES        95
TAN          94
CRUZ         89
DELA CRUZ    83
SANTOS       80
GARCIA       80
LIM          73
RAMOS        68
MENDOZA      65
FERNANDEZ    54
CHUA         52
DE GUZMAN    51
BAUTISTA     50
GONZALES     49
ONG          46
...
SIGNEY      1
SIGLOS      1
LARRACAS    1
LASQUITE    1
LASALETA    1
LASAM       1
LASARA      1
SIERVO      1
LASCO       1
LASCUNA     1
SIENES      1
LASPINAS    1
LASQUERO    1
LASQUETE    1
JUMSALI     1
Length: 8465, dtype: int64

In [14]:
# Bar chart of the ten most common family names.
familynames.value_counts().head(10).plot(kind='bar')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x10db4390>

In [15]:
# Campus assignment per row as a categorical Series; preview the first ten.
campuses = upcat_data['Campus'].astype('category')
campuses.head(10)


Out[15]:
0       MANILA
1             
2      VISAYAS
3      VISAYAS
4    LOS BAÑOS
5      DILIMAN
6      DILIMAN
7    LOS BAÑOS
8      DILIMAN
9      DILIMAN
Name: Campus, dtype: category
Categories (11, object): [BAGUIO < CEBU < DILIMAN < LOS BAÑOS ... PAMPANGA < TACLOBAN < VISAYAS < ]

In [16]:
# Five most frequent 'Campus' values.
campuses.value_counts().head(5)


Out[16]:
DILIMAN      5154
LOS BAÑOS    2530
             1538
VISAYAS      1523
BAGUIO       1281
dtype: int64

In [17]:
# Bar chart of the five most frequent 'Campus' values.
# NOTE(review): one of these is a blank campus label; its count (1538) equals
# the '**Pending Case' course count, so it presumably marks pending cases -
# confirm before reading the unlabeled bar as a campus.
campuses.value_counts().head(5).plot(kind='bar')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d1fb38>

In [ ]: