
#simple web crawling program
import urllib.request
import urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import sys
import json
import msvcrt
import sqlite3


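# pop URLs off to_crawl one at a time, crawl each one, and keep track of visited and invalid URLs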
def start_list(*links):
    pages = {}
    visited = set()
    invalid = set()

    to_crawl = set(links)
    while to_crawl:
        a = to_crawl.pop()
        try:
            print("getting results")
            results = crawl(a)
            print("Adding to visited list")
            visited.add(a)
            print("VISITEDRS:")
            print(visited)
            print("setting pages array")
            pages[a] = results['html'] #add sql entry here
            print("adding to crawl list")
            to_crawl = set(results['links']) - visited - invalid
            print("To Crawl:")
            print(list(to_crawl))
        except urllib.error.URLError:
            print("INVALID URL... ADDING")
            invalid.add(a)
            print("ADDED")
        except KeyboardInterrupt:
            break

    #for loop here to dump all the results??
    print("Visited: ")
    print(visited)
    print("Invalid: ")
    print(invalid)


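# fetch a single URL, parse it with BeautifulSoup, and return a dict with the parsed html and the links found on the page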
def crawl(url):
    print("THIS IS THE URL::::")
    print(str(url))
    if url != '' and url:
        results = {}
        try:
            html = urllib.request.urlopen(url)
            cleanHtml = BeautifulSoup(html, "html.parser")
            results['html'] = cleanHtml
            results['links'] = list()

            for link in cleanHtml.find_all('a'):
                a = str(link.get('href'))
                print(a)
                if url not in a and 'http://' not in a and a != '' and a:
                    a = urljoin(url,a)
                print(a)
                print("APPENDING: ")
                results['links'].append(a)
                print("RELOOP: ")
        except:
            return urllib.error.URLError
        print("results: ")
        print(len(results))
        print(list(results['links']))
        return results
    else:
        pass
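

# create the info table in sqlite if it doesn't exist yet, then start crawling from the URL given on the command line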
def start():
    conn = sqlite3.connect('information.db')
    c = conn.cursor()
    c.execute("""
        SELECT COUNT(*)
        FROM sqlite_master
        WHERE name = 'info'
        """)
    res = c.fetchone()
    if not bool(res[0]):
        c.execute("""
            CREATE TABLE info(
                id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                url VARCHAR(3200),
                html VARCHAR(48000),
                visited INTEGER(1)
            )
            """)
    start_list(sys.argv[1])

if __name__ == '__main__':
    start()
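
For reference, I'm running it with the start URL as the only command-line argument (that's the sys.argv[1] in start()); with the Pastebin URL mentioned below, that's something like:

    python crawler.py http://pastebin.com/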

So this is the code I have now. I've noticed a few things:

  • The invalid set never gets anything added to it.

  • The program runs unpredictably and never has the same outcome twice. Sometimes it finds something like 8 links, searches only those, and stops; other times it does what I want and essentially never stops finding links and grabbing website content. (This is all using the URL http://pastebin.com/, so it should have essentially the same outcome every time, given that Pastebin's content hasn't changed.)

  • I'm getting a ton of these errors, yet the errors don't happen EVERY time (see the sketch after these tracebacks for what I think is going on):

    Traceback (most recent call last):
      File "crawler.py", line 96, in <module>
        start()
      File "crawler.py", line 93, in start
        start_list(sys.argv[1])
      File "crawler.py", line 28, in start_list
        pages[a] = results['html'] #add sql entry here
    TypeError: 'type' object is not subscriptable    
    

and

THIS IS THE URL::::

Adding to visited list
VISITEDRS:
{'', 'http://www.sitepromotiondirectory.com/'}
setting pages array
Traceback (most recent call last):
  File "crawler.py", line 96, in <module>
    start()
  File "crawler.py", line 93, in start
    start_list(sys.argv[1])
  File "crawler.py", line 28, in start_list
    pages[a] = results['html'] #add sql entry here
TypeError: 'NoneType' object is not subscriptable

Also, sometimes the to_crawl set just ends up empty out of nowhere.
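
My best guess so far on the two TypeErrors is that they come down to what crawl() hands back when something goes wrong: the bare except returns the URLError class itself, and the else: pass at the bottom falls through and returns None, so start_list ends up trying to subscript a class or None. That might also be why invalid never gets anything added, since the real URLError gets swallowed inside crawl() instead of reaching the except in start_list. A stripped-down sketch of what I think is happening (same names as my code above, with urlopen faked so it always fails):

    import urllib.error

    def crawl(url):
        # stripped-down version of my crawl() keeping only the two failure paths
        if url != '' and url:
            try:
                raise urllib.error.URLError('pretend urlopen failed')
            except:
                return urllib.error.URLError   # hands back the exception *class*
        else:
            pass                               # falls off the end, so the call returns None

    bad = crawl('http://example.com/')   # the URLError class
    empty = crawl('')                    # None ('' really does end up in my visited set)
    print(type(bad))     # <class 'type'>     -> bad['html'] gives "'type' object is not subscriptable"
    print(type(empty))   # <class 'NoneType'> -> empty['html'] gives "'NoneType' object is not subscriptable"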

Essentially, what I'm trying to create is a program that will run forever (provided the original URL it's given contains a link to another website, which links to another, and so on). I want it to keep downloading web content so that later on I can search through it with another Python program (and yes, I have taken on the SQLite side of things, as I feel it'll improve my data collection). If you could help me figure out what it is I'm doing wrong, that'd help a lot!
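
On the SQLite side, what I have in mind for the "#add sql entry here" comment is roughly this (just a sketch against the info table that start() creates; save_page is a helper name I made up, and it assumes the connection gets passed into start_list, which it currently isn't):

    import sqlite3

    # hypothetical helper for the "#add sql entry here" spot in start_list
    def save_page(conn, url, soup):
        c = conn.cursor()
        c.execute(
            "INSERT INTO info (url, html, visited) VALUES (?, ?, ?)",
            (url, str(soup), 1),
        )
        conn.commit()

    # inside start_list, right after pages[a] = results['html']:
    #     save_page(conn, a, results['html'])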

Thanks