In [ ]:
import pandas as pd
# Sample data: one free-text sentence per weekday, each mentioning one or
# more clock times in slightly different formats.
time_sentences = [
    "Monday: Soccer is at 2:45pm.",
    "Tuesday: Appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a python MOOC!",
    "Thursday: Beer at 11:15 pm at the latest.",
    "Friday: Starts at 08:10 am, ends at 09:00am.",
]

# Single-column frame holding the sentences under the 'text' column.
df = pd.DataFrame({'text': time_sentences})
df
In [ ]:
# character count of every entry in the 'text' column
df.text.str.len()
In [ ]:
# token count per entry: split each string on whitespace, then measure the
# length of the resulting list
(
    df['text']
    .str.split()
    .str.len()
)
In [ ]:
# boolean mask: True for entries that contain the word 'Appointment'
df.text.str.contains('Appointment')
In [ ]:
# number of digit characters occurring in each entry
df.text.str.count(r'\d')
In [ ]:
# list every individual digit found in each entry
df.text.str.findall(r'\d')
In [ ]:
# capture (hour, minute) pairs: one or two digits before the colon and
# exactly two digits after it, for every time mentioned in an entry
df['text'].str.findall(
    r'(\d?\d):(\d\d)'
)
In [ ]:
# replace weekdays with '???'
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave every entry
# unchanged because no sentence contains the literal text '\w+day\b'.
df['text'].str.replace(r'\w+day\b', '???', regex=True)
In [ ]:
# replace weekdays with 3 letter abbreviations
# The callable receives the regex match object and returns the first three
# characters of the captured weekday. regex=True is required: a callable
# replacement is only valid in regex mode, and pandas raises ValueError when
# one is passed with regex=False (the default since pandas 2.0).
df['text'].str.replace(r'(\w+day\b)', lambda m: m.group(1)[:3], regex=True)
In [ ]:
# build one column per capture group (hour, minute) using only the first
# time that matches in each entry
df['text'].str.extract(
    r'(\d?\d):(\d\d)',
    expand=True,
)
In [ ]:
# for every time mentioned in an entry, capture four groups: the whole time,
# its hour, its minute, and the am/pm period (optionally preceded by a space)
df['text'].str.extractall(
    r'((\d?\d):(\d\d) ?([ap]m))'
)
In [ ]:
# same extraction of every time in an entry, but with named groups so the
# resulting columns are labelled 'time', 'hour', 'minute' and 'period'
df['text'].str.extractall(
    r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))'
)