Series


In [5]:
import pandas as pd
import numpy as np

In [6]:
series1 = pd.Series(['a', 'b', 'c'])
print(series1[1:3])


1    b
2    c
dtype: object
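
A Series can also carry an explicit index, in which case you can slice by label as well as by position (a minimal sketch):

In [ ]:
labeled = pd.Series([10, 20, 30], index=['x', 'y', 'z'])
labeled['y':'z']  # label slicing includes the end label
labeled[1:3]      # positional slicing still works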

In [12]:
# Viewing Specific Keys
record = pd.Series({
    'firstname': 'Charles',
    'lastname': 'Givre',
    'middle': 'classified'
})

record[['firstname', 'lastname']]
# firstname    Charles
# lastname       Givre

# String manipulations: Show True/False for each record
record.str.contains('Cha')
# firstname     True
# lastname     False
# middle       False

# Filter based on the returned True/False
record[record.str.contains('Cha')]

# Other string manipulations via the .str accessor
# => contains, count, extract, find, findall, len


Out[12]:
firstname    Charles
dtype: object
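
The other .str methods listed above work the same way. A quick sketch on the same record Series (all standard pandas string methods):

In [ ]:
record.str.len()             # length of each string value
record.str.count('a')        # occurrences of 'a' per value
record.str.findall('[A-Z]')  # list of uppercase letters per value
record.str.find('iv')        # position of the first match, -1 if absent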

In [8]:
# Display the first/last n records with head/tail

randomNumbers = pd.Series(
    np.random.randint(1, 100, 50)
)

randomNumbers.head(10)
# randomNumbers.tail(7)


Out[8]:
0    28
1    81
2    22
3    40
4    29
5     9
6     3
7    56
8    60
9    79
dtype: int64

In [9]:
# Filtering Data in a Series
# 1. Generate True/False values for all values
randomNumbers < 10
# 2. Filter based on the True/False values and show only the matching records
randomNumbers[randomNumbers < 10]


Out[9]:
5     9
6     3
18    9
22    4
32    1
33    2
40    8
42    9
dtype: int64

In [17]:
small_rands = randomNumbers[randomNumbers < 10]
small_rands


Out[17]:
5     9
6     3
18    9
22    4
32    1
33    2
40    8
42    9
dtype: int64
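
Boolean masks can also be combined with & (and) and | (or); each condition needs its own parentheses. A minimal sketch:

In [ ]:
randomNumbers[(randomNumbers < 10) | (randomNumbers > 90)]   # either tail
randomNumbers[(randomNumbers >= 10) & (randomNumbers < 20)]  # a range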

In [18]:
def addTwo(n):
    return n + 2

In [19]:
# Iterate through the entire Series and apply the addTwo function to each value
small_rands.apply(addTwo)


Out[19]:
5     11
6      5
18    11
22     6
32     3
33     4
40    10
42    11
dtype: int64

In [20]:
# Use lambda instead
small_rands.apply(lambda x: x + 1)


Out[20]:
5     10
6      4
18    10
22     5
32     2
33     3
40     9
42    10
dtype: int64
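
For simple arithmetic like this, a vectorized expression gives the same result and is faster than apply:

In [ ]:
small_rands + 1  # element-wise, same output as the lambda above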

In [32]:
# Remove missing values:
Series.dropna()

# Replace missing values with a default value
Series.fillna(value=<something>)
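
A runnable sketch of both on a Series with a missing value:

In [ ]:
s = pd.Series([1.0, np.nan, 3.0])
s.dropna()         # drops the NaN; index label 1 disappears
s.fillna(value=0)  # keeps the row, replacing NaN with 0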

DataFrame


In [ ]:
data = pd.DataFrame(<data>, <index>, <column_names>)

# 1. Pass two-dimensional data -> each column becomes a Series
# 2. Usually, read from outside sources:
data = pd.read_csv(<file>)
data = pd.read_excel('file.xls')
data = pd.read_json(<file>/<url>)
data = pd.read_sql(<query>, <connection_obj>)
data = pd.read_html(<source>)
logdf = pd.read_table('../data/mysql.log', names=['raw'])
# refer to a column:
logdf['raw'].str.extract(
    r'(?P<date>\d{6}\s+\d{1,2}:\d{2}:\d{2})...',
    expand=False
)
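
A self-contained sketch of str.extract with a named group, using synthetic lines shaped like MySQL log entries:

In [ ]:
lines = pd.Series([
    '170102 09:15:02 Connect root@localhost',
    '170102 09:15:07 Query   SELECT 1',
])
lines.str.extract(r'(?P<date>\d{6}\s+\d{1,2}:\d{2}:\d{2})', expand=False)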

In [ ]:
# Web Server Logs
# 1. Complicated to parse by hand - use the apache_log_parser package
import apache_log_parser
line_parser = apache_log_parser.make_parser('%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"')
# (aside: pandas is moving toward the Arrow data format)
server_log = open("../data/hackers-access.httpd", "r")
parsed_server_data = []
# loop, parse each line into a dict, then move the dicts into a DataFrame
for line in server_log:
    parsed_server_data.append(line_parser(line))
server_df = pd.DataFrame(parsed_server_data)

In [ ]:
# Manipulating a DataFrame
s = data['column']  # selecting a single column returns a Series
data['ip'].value_counts().head()  # counts unique IPs!
# select several columns at once
df = data[['column1', 'column2', 'column3']]  # returns a DataFrame

# extract columns and filter
# 1. show only the specified columns
# 2. filter on any column (it need not be among those shown)
df[['col1', 'col2']][df['col3'] > 5]

# pull out individual rows
data.loc[<index>]
data.loc[<list of indexes>]
data.sample(<n>) # return a random sample of the dataset

# Apply to a DataFrame
data.apply(<function>, axis=1)

# - with axis=1, the function receives a Series == each row (the default axis=0 passes each column)
# - the function can return a new row => allows us to add new columns
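
A minimal sketch of row-wise apply on a toy DataFrame (column names made up for illustration):

In [ ]:
toy = pd.DataFrame({'a': [1, 2], 'b': [10, 20]})
toy['total'] = toy.apply(lambda row: row['a'] + row['b'], axis=1)
toy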

In [ ]:
# Apply a function to a column and then create a new column
df = pd.read_csv('data/dailybots.csv')
# df['orgs'] + 2 => will add 2 to the entire column
df['orgs2'] = df['orgs'] + 2 # this will create a new column

In [ ]:
# Transpose
data.T  # reshape the data: rows become columns and vice versa

# Aggregation
# 1. sum down each column
data.sum(axis=0)
# 2. sum across each row
data.sum(axis=1)  # switches from operating per column to per row

# Drop rows/columns
# 1. inplace => modify the current DataFrame instead of returning a copy
# 2. errors => 'raise' (default) or 'ignore' when labels are missing
data.drop(labels, axis=0, level=None, inplace=False, errors='raise')
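
A quick sketch of drop on a toy DataFrame:

In [ ]:
toy = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
toy.drop(0, axis=0)                     # drop the row labeled 0
toy.drop('b', axis=1)                   # drop column 'b'
toy.drop('c', axis=1, errors='ignore')  # no error even though 'c' is absent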

In [ ]:
# Merging datasets

# Union: append Series 1 and Series 2
combinedSeries = pd.concat([series1, series2], ...)

# Join
# 1. Inner Join -> only the rows common to both sets
# 2. Outer Join -> all rows from both sets
# 3. Left Join -> all rows from set A, plus the matching rows from B
# 4. Right Join -> the reverse: all rows from B, plus the matching rows from A
pd.merge(leftData, rightData,
         how="<join type: inner, left, right, outer>",
         on=<field or list of fields>)

In [ ]:
# Grouping and Aggregating data
df_grouped = df.groupby(
    ['Protocol', 'Source', 'Destination']
)
print(df_grouped.size())

stats_packets = df_grouped['Length'].agg(
    no_packets='count',
    volume='sum',
    sd=lambda x: np.std(x, ddof=1),
)
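
Since the packet-capture DataFrame isn't shown here, a self-contained sketch of the same pattern on toy data:

In [ ]:
pkts = pd.DataFrame({
    'Protocol': ['TCP', 'TCP', 'UDP', 'UDP'],
    'Length':   [60, 1500, 120, 80],
})
pkts.groupby('Protocol')['Length'].agg(
    no_packets='count',
    volume='sum',
    sd=lambda x: np.std(x, ddof=1),
)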