Working with Text Data in pandas


In [ ]:
import pandas as pd

# One sample sentence per weekday; each one mentions at least one clock time.
time_sentences = [
    "Monday: Soccer is at 2:45pm.",
    "Tuesday: Appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a python MOOC!",
    "Thursday: Beer at 11:15 pm at the latest.",
    "Friday: Starts at 08:10 am, ends at 09:00am.",
]

# Single-column frame; the cells below operate on df['text'].
df = pd.DataFrame({'text': time_sentences})
df

In [ ]:
# character count of each sentence in the 'text' column
(
    df['text']
    .str.len()
)

In [ ]:
# whitespace-separated token count of each sentence in the 'text' column
(
    df['text']
    .str.split()   # no pattern given: split on whitespace
    .str.len()     # length of each resulting token list
)

In [ ]:
# boolean mask: True for the sentences that contain 'Appointment'
df['text'].str.contains(pat='Appointment')

In [ ]:
# count how many digit characters occur in each sentence
df['text'].str.count(pat=r'\d')

In [ ]:
# list every individual digit found in each sentence
df['text'].str.findall(pat=r'\d')

In [ ]:
# capture an (hour, minute) pair for every time mentioned in each sentence
df['text'].str.findall(pat=r'(\d?\d):(\d\d)')

In [ ]:
# replace weekdays with '???'
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave every sentence unchanged.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

In [ ]:
# replace weekdays with 3 letter abbreviations
# A callable replacement only works with a regex pattern, so regex=True must be
# explicit (pandas >= 2.0 defaults to regex=False and raises ValueError otherwise).
# The callable receives the match object; group 1 is the full weekday name.
df['text'].str.replace(r'(\w+day\b)', lambda m: m.group(1)[:3], regex=True)

In [ ]:
# split the first hour:minute match of each sentence into one column per group
df['text'].str.extract(pat=r'(\d?\d):(\d\d)', expand=True)

In [ ]:
# one row per time found: the full time, its hour, its minute, and the am/pm period
df['text'].str.extractall(pat=r'((\d?\d):(\d\d) ?([ap]m))')

In [ ]:
# same extraction as a row per match, with named groups supplying the column names
# (time, hour, minute, period)
df['text'].str.extractall(
    pat=r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))'
)