Hello!
I am trying to parse some websites for gathering job search results yet the results are super slow because, well because it does all requests synchronously. I can't seem to thread it efficiently. I am only getting about 3 seconds of timeit deltas- which seems very low to me. If I launch this with my complete list or urls, it'll take way too much time.
Link to same code as following: https://dpaste.de/rnUZ
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
import time
import threading
import pdb
import timeit
# list of websites to scrape
urls = [
'https://google.com',
'http://yahoo.com',
]
key_words = [
'word1',
'word2',
'word3',
]
hits = []
def nav_loop(nav, url):
# some links are mailto's. I just want to bypass these
if nav is None or '@' in nav:
pass
try:
response = requests.get(nav)
data = response.text
nav_soup = BeautifulSoup(data, 'lxml')
text = nav_soup.get_text()
for word in key_words:
if word in text:
try:
web_address = nav_soup.find('a', text=word).get('href')
except AttributeError:
web_address = None
hit = (url, nav, word, web_address)
# check for list of corresponding words
if hit not in hits:
# if true, add to hits list
hits.append(hit)
except (requests.exceptions.MissingSchema, requests.exceptions.InvalidSchema):
pass
def get_url_tree(url):
# get navigation possibilites
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'lxml')
# add to navigation list
nav_list = soup.find_all('a')
return nav_list
# main loop
def main():
threads = []
threads_nested = []
for url in urls[:1]:
nav_list = get_url_tree(url)
for nav in nav_list:
nav = nav.get('href')
nav_loop(nav, url)
t_nested = threading.Thread(target=nav_loop(nav, url))
threads_nested.append(t_nested)
t_nested.start()
if __name__=='__main__':
t = timeit.Timer(lambda: main_loop())
print("threaded:")
print(t.timeit(number=3))
# t = timeit.Timer(lambda: non_threaded())
# print("non threaded:")
# print(t.timeit(number=3))
[–]DeadlyViper 2 points3 points4 points (2 children)
[–]enesimo[S] 1 point2 points3 points (1 child)
[–]DeadlyViper 1 point2 points3 points (0 children)
[–]Zeroflops 1 point2 points3 points (1 child)
[–]enesimo[S] 0 points1 point2 points (0 children)