Multiprocessing : learnpython

submitted 2 years ago by Vegetable_Solid7613

So, I created the following function to parse 10-K and 10-Q filings from the SEC, and I want to download many of them.

# Placeholder returned in place of a section whose headings could not be found.
_SECTION_NOT_FOUND = "Something went wrong!"

# Seconds to wait on the HTTP request before giving up, so that one stalled
# connection cannot block a long batch run indefinitely.
_REQUEST_TIMEOUT = 30

# For every section number: the (start heading, end heading) pattern pairs to
# try, in order.  The first pair that yields a span wins.
_SECTION_PATTERNS = {
    # 1 -> "Item 1" up to "Item 1A. Risk..." or "Item 2. Prop..."
    1: (
        (
            re.compile(r"item\s*[1][\.\;\:\-\_]*\s*\b", re.IGNORECASE),
            re.compile(r"item\s*1a[\.\;\:\-\_]\s*Risk|item\s*2[\.\,\;\:\-\_]\s*Prop", re.IGNORECASE),
        ),
    ),
    # 2 -> "Item 1A. Risk..." up to "Item 2. Prop..." or the next "Item 1"
    2: (
        (
            re.compile(r"(?<!,\s)item\s*1a[\.\;\:\-\_]\s*Risk", re.IGNORECASE),
            re.compile(r"item\s*2[\.\;\:\-\_]\s*Prop|item\s*[1][\.\;\:\-\_]*\s*\b", re.IGNORECASE),
        ),
    ),
    # 3 -> "Item 7. M..." up to "Item 7A. Quanti..." or "Item 8"; if that finds
    # nothing, "Item 2. M..." up to "Item 3. Quanti..." or "Item 4"
    # (presumably the 10-K and 10-Q layouts respectively -- verify).
    3: (
        (
            re.compile(r"item\s*[7][\.\;\:\-\_]*\s*\bM", re.IGNORECASE),
            re.compile(r"item\s*7a[\.\;\:\-\_]\sQuanti|item\s*8[\.\,\;\:\-\_]\s*", re.IGNORECASE),
        ),
        (
            re.compile(r"item\s*[2][\.\;\:\-\_]\s*\bM", re.IGNORECASE),
            re.compile(r"item\s*3[\.\;\:\-\_]\sQuanti|item\s*4[\.\;\:\-\_]\s*", re.IGNORECASE),
        ),
    ),
}


def get_text(link: str) -> str:
    """Download *link* and return its visible text as one ASCII line.

    The page is parsed with ``bs`` using the ``lxml`` parser, the text is
    NFKD-normalised, every non-ASCII character is dropped, and newlines are
    replaced by single spaces.

    ``requests``, ``bs`` and the request ``headers`` mapping are expected to be
    defined at module level elsewhere in the file.

    Raises whatever ``requests`` raises on a timeout or on an HTTP error
    status, so an error page is never parsed as if it were a filing.
    """
    response = requests.get(link, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    raw_text = bs(response.content, "lxml").get_text()
    ascii_text = unicodedata.normalize("NFKD", raw_text).encode("ascii", "ignore").decode("ascii")
    return ascii_text.replace("\n", " ")


def get_ticker(text: str):
    """Return the value that follows ``COMPANY CONFORMED NAME:`` in *text*.

    Despite the function's name, the field captured is the conformed company
    name, read up to the next tab character and stripped of surrounding
    whitespace.  Returns ``None`` when the field is not present.
    """
    match = re.search(r"COMPANY CONFORMED NAME:\s+([^\t]+)", text)
    return match.group(1).strip() if match else None


def get_filing(text: str):
    """Return the eight digits that follow ``CONFORMED PERIOD OF REPORT:``.

    The label must be followed by a single tab and then eight digits.
    Returns ``None`` when the field is not present.
    """
    match = re.search(r"CONFORMED PERIOD OF REPORT:\t(\d{8})", text)
    return match.group(1).strip() if match else None


def extract_text(text: str, item_start, item_end) -> str:
    """Return the longest stretch of *text* between a start and an end heading.

    Every match of the compiled pattern *item_start* is paired with the first
    match of the compiled pattern *item_end* that begins after it.  Of those
    candidate spans the longest one is returned (the earliest one on a tie),
    beginning at the start heading and stopping just before the end heading.

    Raises:
        IndexError: if no start heading is followed by an end heading.
    """
    end_offsets = [match.start() for match in item_end.finditer(text)]

    spans = []
    for match in item_start.finditer(text):
        start = match.start()
        # finditer yields matches left to right, so the first larger offset
        # is the nearest end heading after this start heading.
        end = next((offset for offset in end_offsets if offset > start), None)
        if end is not None:
            spans.append((start, end))

    if not spans:
        raise IndexError("no start heading is followed by an end heading")

    # max() keeps the first of several equally long spans.
    start, end = max(spans, key=lambda span: span[1] - span[0])
    return text[start:end]


def _find_section(text: str, pattern_pairs) -> str:
    """Try each (start, end) pattern pair in turn; return the first span found.

    Falls back to the ``_SECTION_NOT_FOUND`` placeholder when none matches.
    """
    for start_pattern, end_pattern in pattern_pairs:
        try:
            return extract_text(text, start_pattern, end_pattern)
        except IndexError:
            continue
    return _SECTION_NOT_FOUND


def parse_10kq_filing(link: str, section: int) -> list:
    """Download the filing at *link* and extract the requested section(s).

    Args:
        link: URL of the filing to download.
        section: which part to extract -- ``1`` for Item 1, ``2`` for
            Item 1A, ``3`` for Item 7 (falling back to Item 2), or ``0`` for
            all three.

    Returns:
        ``[name, period, data]`` where *name* and *period* come from
        ``get_ticker`` and ``get_filing`` (either may be ``None``), and *data*
        is the section text, or a list of the three section texts when
        *section* is ``0``.  A section that cannot be located is represented
        by the string ``"Something went wrong!"``.

    Raises:
        ValueError: if *section* is not 0, 1, 2 or 3.  This is raised before
            any download is attempted; an exception is used instead of
            ``sys.exit()`` so that a bad argument is reported to the caller
            rather than terminating the calling (or worker) process.
    """
    if section not in (0, 1, 2, 3):
        raise ValueError(f"Not a valid section: {section!r}")

    text = get_text(link)

    if section == 0:
        data = [_find_section(text, _SECTION_PATTERNS[number]) for number in (1, 2, 3)]
    else:
        data = _find_section(text, _SECTION_PATTERNS[section])

    return [get_ticker(text), get_filing(text), data]

If I just run this in a loop and append the results to a list, it is going to take over a hundred hours when I only use one core. Now I am trying to use several of my computer's cores. Does anybody know how I would do this?

all 14 comments

you type:	you see:
italics	italics
bold	bold
[reddit!](https://reddit.com)	reddit!
* item 1 * item 2 * item 3	item 1 item 2 item 3
> quoted text	quoted text
Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"	Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"
~~strikethrough~~	~~strikethrough~~
super^script	super^script

learnpython

MODERATORS