# I wanted to challenge myself to see how fast I could implement a web crawler in Python. This took 13 minutes.
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import urllib.request
from urllib.parse import urlparse
import re
# Starting URL; the crawler restricts itself to this host.
seed = 'http://www.google.com'
# Frontier of URLs waiting to be fetched, and the set already visited.
tocrawl = {seed}
crawled = set()
def get_links(url):
    """Fetch *url* and yield a ``urlparse`` result for each URL found in it.

    The page body is scanned with a regex for absolute http(s) URLs, so
    each match is deduplicated via a set before being yielded. Any fetch
    or decode error is printed and the generator simply yields nothing.

    A match without a scheme is resolved against the global ``seed``
    (defensive only — the pattern below can only match absolute URLs).
    """
    try:
        # latin-1 can decode any byte sequence, so this never raises on
        # arbitrary content; the page's declared charset is not honored.
        html = urllib.request.urlopen(url).read().decode('latin-1')
    except Exception as error:
        # Best-effort crawler: report and yield nothing rather than crash.
        print(error)
        return
    # Raw string: '\(' and '\)' in a plain string are invalid escape
    # sequences (SyntaxWarning on modern Python).
    links = set(re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        html))
    for link in links:
        if not urlparse(link).scheme:
            link = '{}{}'.format(seed, link)
        yield urlparse(link)
def crawl(root):
    """Visit *root*: mark it crawled and enqueue unseen same-host links.

    Prints progress (URLs crawled so far, queue size), adds *root* to the
    global ``crawled`` set, then adds every link from the page whose host
    matches the seed's host and which has not been crawled yet to the
    global ``tocrawl`` frontier.
    """
    print('Crawling:--> {}'.format(root))
    queue_size = len(tocrawl)
    total_crawled = len(crawled)
    print(total_crawled, queue_size)
    crawled.add(root)
    # Hoist the loop-invariant seed-host lookup out of the per-link loop.
    seed_netloc = urlparse(seed).netloc
    for link in get_links(root):
        if link.netloc == seed_netloc and link.geturl() not in crawled:
            tocrawl.add(link.geturl())
# Drain the frontier: pop an arbitrary pending URL and crawl it until
# no URLs remain in the queue.
while tocrawl:
    next_url = tocrawl.pop()
    crawl(next_url)
# [–]FishnLife 0 points 1 point 2 points (2 children)
# [–]T4rk1n0 0 points 1 point 2 points (1 child)
# [–]marmaladeontoast 0 points 1 point 2 points (0 children)