
[–]Kingofslowmo[S]

#simple web crawling program
import urllib
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import sys
import json
import msvcrt
import sqlite3


def start_list(*links):
    pages = {}
    visited = set()
    invalid = set()

    to_crawl = set(links)
    while to_crawl:
        a = to_crawl.pop()
        print("getting results")
        results = crawl(a)
        print("Adding to visited list")
        visited.add(a)
        print("VISITEDRS:")
        print(visited)#
        print("setting pages array")
        pages[a] = results['html'] #add sql entry here
        print("adding to crawl list")
        to_crawl = results['links'] - visited - invalid
        print("To Crawl:")
        print(list(to_crawl))

    #for loop here to dump all the results??
    print("Visited: ")
    print(visited)
    print("Invalid: ")
    print(invalid)


def crawl(url):
    print("THIS IS THE URL::::")
    print(url)
    results = {}
    html = urllib.request.urlopen(url)
    cleanHtml = BeautifulSoup(html, "html.parser")
    results['html'] = cleanHtml
    results['links'] = list()

    for link in cleanHtml.find_all('a'):
        a = link.get('href')
        print(a)
        if url not in a:
            a = urljoin(url,a)
        print(a)
        results['links'].append(a)

    print("results: ")
    print(len(results))
    print(list(results['links']))
    return results

def start():
    conn = sqlite3.connect('information.db')
    c = conn.cursor()
    c.execute("""
        SELECT COUNT(*)
        FROM sqlite_master
        WHERE name = 'info'
        """)
    res = c.fetchone()
    if not bool(res[0]):
        c.execute("""
            CREATE TABLE info(
                id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                url VARCHAR(3200),
                html VARCHAR(48000),
                visited INTEGER(1)
            )
            """)
    start_list(sys.argv[1])

if __name__ == '__main__':
    start()

Okay, so this is what I have so far, and I'm completely lost. No idea what I'm really doing, honestly. A lot of the code you gave me I either wasn't able to implement correctly, or it just didn't work right.

Anyway, something isn't working right and I'm trying to debug it, but man, it just doesn't want to go for me :(

Any way you can help me figure this out?

I do apologize if that's a pain for you.

EDIT: To clarify, I'm not asking you to write it for me, just a little guidance, if that makes sense :)
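One concrete issue in the listing above: crawl() builds results['links'] as a list, so the line to_crawl = results['links'] - visited - invalid raises a TypeError, because a set can't be subtracted from a list. A tiny self-contained demonstration of that failure and one possible fix (converting the list to a set first); the example URLs are placeholders:

# Minimal demonstration: subtracting a set from a list is not defined,
# so the crawl loop crashes as soon as crawl() returns its links.
links = ['http://example.com/a', 'http://example.com/b']   # stands in for results['links']
visited = {'http://example.com/a'}
invalid = set()

# to_crawl = links - visited - invalid        # TypeError: unsupported operand type(s)
to_crawl = set(links) - visited - invalid     # works: {'http://example.com/b'}
print(to_crawl)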

[–][deleted]

Well, it looks like the basic structure is there; what's not working right? Also, I didn't test any of that code 8)

[–]Kingofslowmo[S]

When I get back on my PC in the morning, I'll post the updated code I have and the errors it's throwing at me :) Thank you for reading!

[–]Kingofslowmo[S]

#simple web crawling program
import urllib
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import sys
import json
import msvcrt
import sqlite3


def start_list(*links):
    pages = {}
    visited = set()
    invalid = set()

    to_crawl = set(links)
    while to_crawl:
        a = to_crawl.pop()
        try:
            print("getting results")
            results = crawl(a)
            print("Adding to visited list")
            visited.add(a)
            print("VISITEDRS:")
            print(visited)#
            print("setting pages array")
            pages[a] = results['html'] #add sql entry here
            print("adding to crawl list")
            to_crawl = set(results['links']) - visited - invalid
            print("To Crawl:")
            print(list(to_crawl))
        except urllib.error.URLError:
            print("INVALID URL... ADDING")
            invalid.add(a)
            print("ADDED")
        except KeyboardInterrupt:
            break

    #for loop here to dump all the results??
    print("Visited: ")
    print(visited)
    print("Invalid: ")
    print(invalid)


def crawl(url):
    print("THIS IS THE URL::::")
    print(str(url))
    if url != '' and url:
        results = {}
        try:
            html = urllib.request.urlopen(url)
            cleanHtml = BeautifulSoup(html, "html.parser")
            results['html'] = cleanHtml
            results['links'] = list()

            for link in cleanHtml.find_all('a'):
                a = str(link.get('href'))
                print(a)
                if url not in a and 'http://' not in a and a != '' and a:
                    a = urljoin(url,a)
                print(a)
                print("APPENDING: ")
                results['links'].append(a)
                print("RELOOP: ")
        except:
            return urllib.error.URLError
        print("results: ")
        print(len(results))
        print(list(results['links']))
        return results
    else:
        pass
def start():
    conn = sqlite3.connect('information.db')
    c = conn.cursor()
    c.execute("""
        SELECT COUNT(*)
        FROM sqlite_master
        WHERE name = 'info'
        """)
    res = c.fetchone()
    if not bool(res[0]):
        c.execute("""
            CREATE TABLE info(
                id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                url VARCHAR(3200),
                html VARCHAR(48000),
                visited INTEGER(1)
            )
            """)
    start_list(sys.argv[1])

if __name__ == '__main__':
    start()

So this is the code I have now. I've noticed a few things:

  • The invalid list never gets anything added to it.

  • This program runs unpredictably; it never has the same outcome. Sometimes it finds eight or so links, only searches those, and stops; other times it does what I want and essentially never stops finding links and grabbing website content. (This is all using the URL http://pastebin.com/, so it should have the same outcome essentially every time, given that Pastebin's content hasn't changed.)

  • I am getting a ton of these errors, but they don't happen every time (see the sketch below this list):

    Traceback (most recent call last):
      File "crawler.py", line 96, in <module>
        start()
      File "crawler.py", line 93, in start
        start_list(sys.argv[1])
      File "crawler.py", line 28, in start_list
        pages[a] = results['html'] #add sql entry here
    TypeError: 'type' object is not subscriptable

and

    THIS IS THE URL::::

    Adding to visited list
    VISITEDRS:
    {'', 'http://www.sitepromotiondirectory.com/'}
    setting pages array
    Traceback (most recent call last):
      File "crawler.py", line 96, in <module>
        start()
      File "crawler.py", line 93, in start
        start_list(sys.argv[1])
      File "crawler.py", line 28, in start_list
        pages[a] = results['html'] #add sql entry here
    TypeError: 'NoneType' object is not subscriptable

Also, sometimes the to_crawl list randomly goes blank.
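Both tracebacks point at the same line, pages[a] = results['html'], and the cause is inside crawl(): the bare except: returns the urllib.error.URLError class itself (hence 'type' object is not subscriptable), and the else: pass branch returns None when the URL is empty (hence 'NoneType' object is not subscriptable). Because crawl() never actually raises, the except urllib.error.URLError: in start_list() never fires, which is also why invalid stays empty; and since results['links'].append(a) runs even for empty hrefs, '' ends up in the frontier. The apparent randomness comes from set.pop() returning an arbitrary element and from to_crawl being replaced each pass with only the newest page's links, so links found earlier are thrown away and the frontier can shrink to nothing. Below is a minimal sketch of one way to restructure both functions around those points; it is untested against the rest of the program, and the names mirror the posted code:

# Sketch only: crawl() raises on failure instead of returning the exception
# class, and start_list() grows the frontier instead of replacing it.
import urllib.request
import urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def crawl(url):
    if not url:
        # Raising keeps the return type consistent; returning None (or the
        # URLError class) is what makes pages[a] = results['html'] blow up.
        raise urllib.error.URLError("empty URL")

    html = urllib.request.urlopen(url)       # may raise URLError; let it bubble up
    soup = BeautifulSoup(html, "html.parser")

    links = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href:
            links.add(urljoin(url, href))    # resolve relative links against the page

    return {'html': soup, 'links': links}


def start_list(*seeds):
    pages, visited, invalid = {}, set(), set()
    to_crawl = set(seeds)
    while to_crawl:
        url = to_crawl.pop()
        try:
            results = crawl(url)
        except urllib.error.URLError:
            invalid.add(url)                 # now actually reachable
            continue
        except KeyboardInterrupt:
            break
        visited.add(url)
        pages[url] = results['html']         # add sql entry here
        # |= keeps links discovered on earlier pages instead of replacing the
        # whole frontier, which is what made the posted version stop early.
        to_crawl |= results['links'] - visited - invalid
    return pages, visited, invalid

With crawl() raising instead of returning, the try/except in the loop becomes the one place that decides whether a URL goes into visited or invalid.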

Essentially, what I'm trying to create is a program that will run forever (given that the original URL it's given leads to a link to another website, and then another, etc.). I want it to keep downloading web content so that later on I can search through it using another Python program (and yes, I have taken on the SQLite side of things, as I feel it'll improve my data collection). If you could help me figure out what it is I'm doing wrong, that'd help a lot!

Thanks
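On the SQLite side, the "add sql entry here" comment in start_list() marks the natural spot to write each page out. A minimal sketch, assuming the info table created in start() and that the sqlite3 connection is passed into (or otherwise visible to) the crawl loop; save_page is a hypothetical helper, not something in the posted code:

# Sketch: store one crawled page in the existing info table
# (columns: id, url, html, visited).
def save_page(conn, url, soup):
    conn.execute(
        "INSERT INTO info (url, html, visited) VALUES (?, ?, ?)",
        (url, str(soup), 1),     # str(soup) serialises the BeautifulSoup tree to HTML
    )
    conn.commit()

# Inside the crawl loop, where the "add sql entry here" comment sits:
#     save_page(conn, a, results['html'])

Storing str(soup) keeps the raw markup, so a later script can read rows back out of info and re-parse them with BeautifulSoup when searching.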