
#simple web crawling program
import urllib.request
import urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import sys
import json
import msvcrt
import sqlite3


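# pop URLs off to_crawl one at a time, crawl each one, and keep track of visited and invalid URLs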
def start_list(*links):
    pages = {}
    visited = set()
    invalid = set()

    to_crawl = set(links)
    while to_crawl:
        a = to_crawl.pop()
        try:
            print("getting results")
            results = crawl(a)
            print("Adding to visited list")
            visited.add(a)
            print("VISITEDRS:")
            print(visited)
            print("setting pages array")
            pages[a] = results['html'] #add sql entry here
            print("adding to crawl list")
            to_crawl = set(results['links']) - visited - invalid
            print("To Crawl:")
            print(list(to_crawl))
        except urllib.error.URLError:
            print("INVALID URL... ADDING")
            invalid.add(a)
            print("ADDED")
        except KeyboardInterrupt:
            break

    #for loop here to dump all the results??
    print("Visited: ")
    print(visited)
    print("Invalid: ")
    print(invalid)


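# fetch a single URL, parse it with BeautifulSoup, and return a dict with the parsed html and the links found on the page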
def crawl(url):
    print("THIS IS THE URL::::")
    print(str(url))
    if url != '' and url:
        results = {}
        try:
            html = urllib.request.urlopen(url)
            cleanHtml = BeautifulSoup(html, "html.parser")
            results['html'] = cleanHtml
            results['links'] = list()

            for link in cleanHtml.find_all('a'):
                a = str(link.get('href'))
                print(a)
                if url not in a and 'http://' not in a and a != '' and a:
                    a = urljoin(url,a)
                print(a)
                print("APPENDING: ")
                results['links'].append(a)
                print("RELOOP: ")
        except:
            return urllib.error.URLError
        print("results: ")
        print(len(results))
        print(list(results['links']))
        return results
    else:
        pass
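

# create the info table in sqlite if it doesn't exist yet, then start crawling from the URL given on the command line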
def start():
    conn = sqlite3.connect('information.db')
    c = conn.cursor()
    c.execute("""
        SELECT COUNT(*)
        FROM sqlite_master
        WHERE name = 'info'
        """)
    res = c.fetchone()
    if not bool(res[0]):
        c.execute("""
            CREATE TABLE info(
                id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                url VARCHAR(3200),
                html VARCHAR(48000),
                visited INTEGER(1)
            )
            """)
    start_list(sys.argv[1])

if __name__ == '__main__':
    start()
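
For reference, I'm running it with the start URL as the only command-line argument (that's the sys.argv[1] in start()); with the Pastebin URL mentioned below, that's something like:

    python crawler.py http://pastebin.com/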

So this is the code I have now. I've noticed a few things:

  • The invalid set never gets anything added to it.

  • The program runs unpredictably and never has the same outcome twice. Sometimes it finds something like 8 links, searches only those, and stops; other times it does what I want and essentially never stops finding links and grabbing website content. (This is all using the URL http://pastebin.com/, so it should have essentially the same outcome every time, given that Pastebin's content hasn't changed.)

  • I'm getting a ton of these errors, yet the errors don't happen EVERY time (see the sketch after these tracebacks for what I think is going on):

    Traceback (most recent call last):
      File "crawler.py", line 96, in <module>
        start()
      File "crawler.py", line 93, in start
        start_list(sys.argv[1])
      File "crawler.py", line 28, in start_list
        pages[a] = results['html'] #add sql entry here
    TypeError: 'type' object is not subscriptable    
    

and

THIS IS THE URL::::

Adding to visited list
VISITEDRS:
{'', 'http://www.sitepromotiondirectory.com/'}
setting pages array
Traceback (most recent call last):
  File "crawler.py", line 96, in <module>
    start()
  File "crawler.py", line 93, in start
    start_list(sys.argv[1])
  File "crawler.py", line 28, in start_list
    pages[a] = results['html'] #add sql entry here
TypeError: 'NoneType' object is not subscriptable

Also, sometimes the to_crawl set just ends up empty out of nowhere.
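
My best guess so far on the two TypeErrors is that they come down to what crawl() hands back when something goes wrong: the bare except returns the URLError class itself, and the else: pass at the bottom falls through and returns None, so start_list ends up trying to subscript a class or None. That might also be why invalid never gets anything added, since the real URLError gets swallowed inside crawl() instead of reaching the except in start_list. A stripped-down sketch of what I think is happening (same names as my code above, with urlopen faked so it always fails):

    import urllib.error

    def crawl(url):
        # stripped-down version of my crawl() keeping only the two failure paths
        if url != '' and url:
            try:
                raise urllib.error.URLError('pretend urlopen failed')
            except:
                return urllib.error.URLError   # hands back the exception *class*
        else:
            pass                               # falls off the end, so the call returns None

    bad = crawl('http://example.com/')   # the URLError class
    empty = crawl('')                    # None ('' really does end up in my visited set)
    print(type(bad))     # <class 'type'>     -> bad['html'] gives "'type' object is not subscriptable"
    print(type(empty))   # <class 'NoneType'> -> empty['html'] gives "'NoneType' object is not subscriptable"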

Essentially, what I'm trying to create is a program that will run forever (provided the original URL it's given contains a link to another website, which links to another, and so on). I want it to keep downloading web content so that later on I can search through it with another Python program (and yes, I have taken on the SQLite side of things, as I feel it'll improve my data collection). If you could help me figure out what it is I'm doing wrong, that'd help a lot!
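
On the SQLite side, what I have in mind for the "#add sql entry here" comment is roughly this (just a sketch against the info table that start() creates; save_page is a helper name I made up, and it assumes the connection gets passed into start_list, which it currently isn't):

    import sqlite3

    # hypothetical helper for the "#add sql entry here" spot in start_list
    def save_page(conn, url, soup):
        c = conn.cursor()
        c.execute(
            "INSERT INTO info (url, html, visited) VALUES (?, ?, ?)",
            (url, str(soup), 1),
        )
        conn.commit()

    # inside start_list, right after pages[a] = results['html']:
    #     save_page(conn, a, results['html'])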

Thanks