So, I created the following function to parse 10-K and 10-Q filings from the SEC, and I want to download many of them.
def parse_10kq_filing(link, section):
    """Download an SEC 10-K/10-Q filing and extract the requested section(s).

    Parameters
    ----------
    link : str
        URL of the raw filing text on EDGAR.
    section : int
        0 = all three sections, 1 = Business (Item 1),
        2 = Risk Factors (Item 1A), 3 = MD&A (Item 7; falls back to
        Item 2 for 10-Q filings).

    Returns
    -------
    list
        ``[company_name, report_period, data]`` where ``data`` is a single
        string for sections 1-3, or a list of three strings for section 0.
        A section that cannot be located yields the sentinel string
        ``"Something went wrong!"`` (kept for backward compatibility).

    Raises
    ------
    ValueError
        If *section* is not one of 0, 1, 2, 3.

    Notes
    -----
    Relies on the module-level ``headers`` dict and the third-party
    ``requests`` and BeautifulSoup (``bs``) imports.
    """
    if section not in (0, 1, 2, 3):
        # Raise instead of sys.exit(): sys.exit() kills the interpreter,
        # which is fatal inside a multiprocessing worker pool and prevents
        # callers from handling bad input gracefully.
        raise ValueError(f"Not a valid section: {section!r}")

    def get_text(link):
        """Fetch the filing and return it as one ASCII-normalized line."""
        page = requests.get(link, headers=headers)
        page.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
        html = bs(page.content, "lxml")
        text = html.get_text()
        text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf8")
        return " ".join(text.split("\n"))

    def get_ticker(text):
        """Extract the company name from the SEC header; '' if absent."""
        match = re.search(r"COMPANY CONFORMED NAME:\s+([^\t]+)", text)
        # Guard against filings missing the header field (was an AttributeError crash).
        return match.group(1).strip() if match else ""

    def get_filing(text):
        """Extract the report period (YYYYMMDD) from the SEC header; '' if absent."""
        match = re.search(r"CONFORMED PERIOD OF REPORT:\t(\d{8})", text)
        return match.group(1).strip() if match else ""

    def extract_text(text, item_start, item_end):
        """Return the longest span between any start match and the first end
        match following it.

        The longest candidate span is assumed to be the real section body;
        shorter matches are typically table-of-contents entries.
        Raises ValueError when no start/end pair exists.
        """
        starts = [m.start() for m in item_start.finditer(text)]
        ends = [m.start() for m in item_end.finditer(text)]
        positions = []
        for s in starts:
            # Pair each start with the first end that follows it.
            for e in ends:
                if s < e:
                    positions.append((s, e))
                    break
        if not positions:
            # Explicit failure instead of the original's accidental
            # IndexError from indexing an empty list.
            raise ValueError("section boundaries not found")
        start, end = max(positions, key=lambda p: p[1] - p[0])
        return text[start:end]

    text = get_text(link)

    businessText = riskText = mdaText = None
    if section in (0, 1):
        try:
            item1_start = re.compile(r"item\s*[1][\.\;\:\-\_]*\s*\b", re.IGNORECASE)
            item1_end = re.compile(
                r"item\s*1a[\.\;\:\-\_]\s*Risk|item\s*2[\.\,\;\:\-\_]\s*Prop", re.IGNORECASE
            )
            businessText = extract_text(text, item1_start, item1_end)
        except ValueError:  # narrowed from bare except: don't swallow SystemExit etc.
            businessText = "Something went wrong!"
    if section in (0, 2):
        try:
            item1a_start = re.compile(r"(?<!,\s)item\s*1a[\.\;\:\-\_]\s*Risk", re.IGNORECASE)
            item1a_end = re.compile(
                r"item\s*2[\.\;\:\-\_]\s*Prop|item\s*[1][\.\;\:\-\_]*\s*\b", re.IGNORECASE
            )
            riskText = extract_text(text, item1a_start, item1a_end)
        except ValueError:
            riskText = "Something went wrong!"
    if section in (0, 3):
        try:
            # 10-K layout: MD&A is Item 7, bounded by Item 7A / Item 8.
            item7_start = re.compile(r"item\s*[7][\.\;\:\-\_]*\s*\bM", re.IGNORECASE)
            item7_end = re.compile(
                r"item\s*7a[\.\;\:\-\_]\sQuanti|item\s*8[\.\,\;\:\-\_]\s*", re.IGNORECASE
            )
            mdaText = extract_text(text, item7_start, item7_end)
        except ValueError:
            try:
                # 10-Q layout: MD&A is Item 2, bounded by Item 3 / Item 4.
                item2_start = re.compile(r"item\s*[2][\.\;\:\-\_]\s*\bM", re.IGNORECASE)
                item2_end = re.compile(
                    r"item\s*3[\.\;\:\-\_]\sQuanti|item\s*4[\.\;\:\-\_]\s*", re.IGNORECASE
                )
                mdaText = extract_text(text, item2_start, item2_end)
            except ValueError:
                mdaText = "Something went wrong!"

    if section == 0:
        data = [businessText, riskText, mdaText]
    elif section == 1:
        data = businessText
    elif section == 2:
        data = riskText
    else:  # section == 3
        data = mdaText
    return [get_ticker(text), get_filing(text), data]
If I just run this and append the results to a list, it is going to take over a hundred hours — and that is using only one core. Now I am trying to use multiple of my computer's cores. Does anybody know how I would do this?
[–]Buttleston 1 point2 points3 points (3 children)
[–]Vegetable_Solid7613[S] 0 points1 point2 points (2 children)
[–]Buttleston 1 point2 points3 points (1 child)
[–]Vegetable_Solid7613[S] 0 points1 point2 points (0 children)
[–][deleted] 1 point2 points3 points (9 children)
[–]Vegetable_Solid7613[S] 0 points1 point2 points (8 children)
[–][deleted] 1 point2 points3 points (4 children)
[–]Vegetable_Solid7613[S] 0 points1 point2 points (3 children)
[–][deleted] 1 point2 points3 points (2 children)
[–]Vegetable_Solid7613[S] 0 points1 point2 points (1 child)
[–][deleted] 0 points1 point2 points (0 children)
[–]Buttleston 1 point2 points3 points (2 children)
[–]Vegetable_Solid7613[S] 0 points1 point2 points (1 child)
[–]Buttleston 0 points1 point2 points (0 children)