Title: String Munging In Dataframe
Slug: pandas_string_munging
Summary: String Munging In Dataframe
Date: 2016-05-01 12:00
Category: Python
Tags: Data Wrangling
Authors: Chris Albon
In [1]:
import pandas as pd
import numpy as np
import re as re
In [2]:
# Build a small example frame. The third row (Tina) has no email so that the
# string methods used in later cells have a missing value to deal with.
# np.nan is spelled in lowercase because the np.NAN alias was removed in
# NumPy 2.0 and raises AttributeError there.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.nan, 'battler@milner.com', 'Ames1234@yahoo.com'],
    'preTestScore': [4, 24, 31, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70],
}
# Passing `columns` pins the column order explicitly.
df = pd.DataFrame(
    raw_data,
    columns=['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'],
)
df
Out[2]:
In [3]:
# Test every email for the text 'gmail'; the result is displayed as the cell output.
df['email'].str.contains(pat='gmail')
Out[3]:
In [4]:
# Email regex with three capture groups, assembled from raw-string pieces.
# It is written in upper case only; the cells that use it pass
# re.IGNORECASE so lower-case addresses match as well.
pattern = (
    r'([A-Z0-9._%+-]+)'  # group 1: text before the '@'
    r'@'
    r'([A-Z0-9.-]+)'     # group 2: domain name
    r'\.'                # literal dot
    r'([A-Z]{2,4})'      # group 3: 2-4 letter suffix
)
In [5]:
# List every match of the email pattern found in each row, ignoring letter case.
df['email'].str.findall(pat=pattern, flags=re.I)
Out[5]:
In [6]:
matches = df['email'].str.match(pattern, flags=re.IGNORECASE)
matches
Out[6]:
In [7]:
matches.str[1]
Out[7]: