A very insightful post @jasoleramos, thank you for sharing. I agree that RegEx analysis can be an extremely useful tool in identifying patterns within recurring document types in NLP applications. One example is SEC 10-K filings, where an analyst can leverage RegEx patterns to extract key financial data or information contained within 10-K filings in a systematic way, allowing analysts to analyse thousands of filings at once in a seamless manner. Here's another example of RegEx analysis at work on 10-K filings, with the aim of extracting Items 1A, 7 and 7A, as they are strategic for sentiment analysis: these sections are traditionally rich in information related to a company’s risk factors, management’s discussion of financial condition, and qualitative disclosures about market risk:
------------------------
import pandas as pd
import numpy as np
import time
import random
import requests
import re
from bs4 import BeautifulSoup
# request function
def send_request(url):
    """Send a GET request to *url* using a randomly chosen User-Agent.

    A short random pause (1-3 s) follows every request to stay within
    SEC EDGAR's fair-access rate limits.

    Parameters
    ----------
    url : str
        The URL to fetch.

    Returns
    -------
    requests.Response
        The raw response object (status is NOT checked here).
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    ]
    headers = {'User-Agent': random.choice(user_agents)}
    # timeout prevents a single stalled connection from hanging a batch
    # run indefinitely (requests has no default timeout)
    response = requests.get(url, headers=headers, timeout=30)
    time.sleep(random.uniform(1, 3))
    return response
# get filing data
def get_filing_html(cik_info, accession_number, filing_type, filing_date):
    """Download the full-text submission for a single 10-K filing from EDGAR.

    Parameters
    ----------
    cik_info : dict
        Company metadata; must contain 'cik', may contain 'tickers' and
        'sicDescription'.
    accession_number : str
        EDGAR accession number in dashed form (e.g. '0001193125-11-282113').
    filing_type : str
        Filing type; anything other than '10-K' is skipped.
    filing_date : str
        Filing date, passed through into the result record.

    Returns
    -------
    dict or None
        A record with the raw filing text and metadata, or None when the
        filing is not a 10-K or the download fails.
    """
    # guard clause: only 10-K filings are of interest here
    if filing_type != '10-K':
        return None

    accession_no_dashes = accession_number.replace('-', '')
    url = f"https://www.sec.gov/Archives/edgar/data/{cik_info['cik']}/{accession_no_dashes}/{accession_number}.txt"

    try:
        response = send_request(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    tickers = cik_info.get('tickers')
    return {
        'CIK': cik_info['cik'],
        'Ticker': tickers[0] if tickers else 'N/A',
        'Accession Number': accession_number,
        'SIC Description': cik_info.get('sicDescription', 'N/A'),
        'Text Document': response.text,
        'Filing Type': filing_type,
        'Filing Date': filing_date,
    }
def extract_sentences(raw_10k):
    """Extract the text of Items 1A, 7 and 7A from a raw EDGAR 10-K submission.

    Parameters
    ----------
    raw_10k : str
        The full .txt submission as downloaded from EDGAR; it embeds one or
        more <DOCUMENT>...</DOCUMENT> sections, each tagged with <TYPE>.

    Returns
    -------
    str
        Plain-text content of Item 1A, Item 7 and Item 7A joined by single
        spaces; any item not found contributes an empty string.
    """
    # Regex patterns delimiting the embedded documents and their type tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    # Matches item headings such as ">Item 1A." or "ITEM 7".
    # NOTE(review): the 2nd/3rd alternatives after \s appear to be literal
    # non-breaking-space characters (decoded '&#160;'/'&nbsp;' HTML
    # entities) — confirm against the original source before reuse.
    item_regex = re.compile(r'(>Item(\s| | )(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')
    # finding document start and end indices
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    document = {}
    # looping through each section type and save only the '10-K' section in dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]
    # finding matches using regex in '10-K' section
    matches = item_regex.finditer(document.get('10-K', ''))
    # creating a df from matches: matched heading text plus its character offsets
    item_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches], columns=['item', 'start', 'end'])
    item_df['item'] = item_df['item'].str.lower()
    # cleaning up unnecessary characters so headings normalize to e.g. 'item1a'
    # NOTE(review): the first three replace targets look like non-breaking
    # spaces / decoded HTML entities mangled in transcription — verify.
    item_df.replace('&#160;', ' ', regex=True, inplace=True)
    item_df.replace('&nbsp;', ' ', regex=True, inplace=True)
    item_df.replace(' ', '', regex=True, inplace=True)
    item_df.replace('\.', '', regex=True, inplace=True)
    item_df.replace('>', '', regex=True, inplace=True)
    # dropping duplicates: keep='last' keeps the LAST occurrence of each
    # heading — presumably the actual section body rather than the table-of-
    # contents entry earlier in the document. TODO confirm for edge cases.
    pos_dat = item_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    # extracting content for each item; each item's text runs from its own
    # heading to the start of the next expected heading (1A->1B, 7->7A, 7A->8)
    item_1a_raw = ''
    item_7_raw = ''
    item_7a_raw = ''
    if 'item1a' in pos_dat['item'].values:
        item_1a_start = pos_dat.loc[pos_dat['item'] == 'item1a', 'start'].values
        item_1b_start = pos_dat.loc[pos_dat['item'] == 'item1b', 'start'].values
        if item_1a_start.size > 0 and item_1b_start.size > 0:
            item_1a_raw = document.get('10-K', '')[item_1a_start[0]:item_1b_start[0]]
    if 'item7' in pos_dat['item'].values:
        item_7_start = pos_dat.loc[pos_dat['item'] == 'item7', 'start'].values
        item_7a_start = pos_dat.loc[pos_dat['item'] == 'item7a', 'start'].values
        if item_7_start.size > 0 and item_7a_start.size > 0:
            item_7_raw = document.get('10-K', '')[item_7_start[0]:item_7a_start[0]]
    if 'item7a' in pos_dat['item'].values:
        item_7a_start = pos_dat.loc[pos_dat['item'] == 'item7a', 'start'].values
        item_8_start = pos_dat.loc[pos_dat['item'] == 'item8', 'start'].values
        if item_7a_start.size > 0 and item_8_start.size > 0:
            item_7a_raw = document.get('10-K', '')[item_7a_start[0]:item_8_start[0]]
    # stripping HTML markup from each extracted slice via BeautifulSoup
    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml').get_text()
    item_7_content = BeautifulSoup(item_7_raw, 'lxml').get_text()
    item_7a_content = BeautifulSoup(item_7a_raw, 'lxml').get_text()
    # combining sentences from different items into a single string
    all_sentences = f"{item_1a_content} {item_7_content} {item_7a_content}"
    return all_sentences
# --- example usage: fetch and parse one Apple 10-K filing ---
cik_info = {
    'cik': '0000320193',                               # example CIK (Apple Inc.)
    'tickers': ['AAPL'],                               # example ticker
    'sicDescription': 'Electronic Computers [3571]',
}
accession_number = '0001193125-11-282113'
filing_type = '10-K'
filing_date = '2011-10-26'

# download the filing, then attach the extracted item text and print a summary
filing_data = get_filing_html(cik_info, accession_number, filing_type, filing_date)
if filing_data is not None:
    filing_data['Sentences'] = extract_sentences(filing_data['Text Document'])
    summary_cols = ['CIK', 'Ticker', 'Filing Date', 'SIC Description', 'Sentences']
    print(pd.DataFrame([filing_data])[summary_cols])
Thank you very much, Max, for participating in the forum and for being a member of CuriousAI.net. I completely agree that RegEx is incredibly useful for extracting key financial data from financial reports like the 10-K. The Python solution you’ve shared for extracting Items 1A, 7, and 7A from the 10-K is very helpful. If you agree, perhaps you and Carlos Soler could review, generalize, and comment on the Python code so that it can be published for use by any member of our community.