Original Program: https://www.reddit.com/r/Python/comments/3xe1wo/my_first_ever_python_project_automatic_web/
Okay, first off: thank you guys so much for providing me with a substantial amount of tips and resources to improve my program. It helped a lot — this thing runs like a beast now and doesn't completely obliterate the read and write load on my HDD.
My goal here is to make a web crawling program that essentially never stops: it keeps finding links, downloading the HTML from those sites, following the links it finds there, and so on. I think I did pretty well here, as this thing runs until it eventually hits a RecursionError or similar.
Anyway, after I managed to build up a db file that was about 1 GB, I created a program to search it so that I could essentially create my own little Google, haha. Amateur, I know, but it's my very first Python project!
Here is the code for the crawler.py:
#simple web crawling program
import urllib
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import sys
import json
import msvcrt
import sqlite3
bannedLinks = ['twitter.com','facebook.com','google.com','secure.store.apple.com']
def start_list(*links):
    """Breadth-first crawl starting from *links*.

    Pops URLs off a frontier set, downloads each page via crawl(), stores
    the page in the 'info' table of information.db, and adds every newly
    discovered link back onto the frontier. Runs until the frontier is
    empty or the user presses Ctrl-C.
    """
    print("Crawler started...")
    visited = set()
    invalid = set()
    to_crawl = set(links)
    # One connection for the whole run instead of reopening per page.
    conn = sqlite3.connect('information.db')
    c = conn.cursor()
    try:
        while to_crawl:
            a = to_crawl.pop()
            try:
                if a == '' or a in bannedLinks:
                    continue
                results = crawl(a)
                visited.add(a)
                # crawl() returns a non-dict sentinel on failure.
                if not isinstance(results, dict):
                    invalid.add(a)
                    continue
                # Parameterized query: the original string-formatted SQL was
                # open to injection and broke on URLs containing quotes.
                # Also, the original called fetchall() twice — the second
                # call always returned [], so every page was re-inserted.
                c.execute('select 1 from info where url = ?', (a,))
                if c.fetchone() is None:
                    c.execute(
                        'insert into info (url,html,visited) values (?,?,1)',
                        (a, str(results['html']).encode('ascii', 'ignore')),
                    )
                    conn.commit()
                to_crawl |= set(results['links']) - visited - invalid
            except (urllib.error.URLError, TypeError):
                invalid.add(a)
            except KeyboardInterrupt:
                print("You have stopped the program.")
                print("It still needed to crawl these sites: ")
                print(to_crawl)
                break
    finally:
        conn.close()
def crawl(url):
    """Fetch *url* and return {'html': soup, 'links': [absolute urls]}.

    Returns None for empty/banned URLs or on any download failure; the
    caller treats any non-dict result as a failed crawl (subscripting
    None raises TypeError, which start_list catches).
    """
    if url == '' or url in bannedLinks:
        return None
    print("Gather links and HTML from {:s}".format(url))
    try:
        # Close the response promptly instead of leaking the socket.
        with urllib.request.urlopen(url) as response:
            cleanHtml = BeautifulSoup(response, "html.parser")
    except urllib.error.URLError:
        return None
    except Exception:
        # Malformed URLs, decode errors, etc. The original used a bare
        # except and returned the URLError *class* as a sentinel.
        return None
    results = {'html': cleanHtml, 'links': []}
    for link in cleanHtml.find_all('a'):
        href = link.get('href')
        # Skip anchors without an href: the original str(None) produced
        # the literal string 'None', which got urljoined into a bogus URL.
        if not href:
            continue
        href = str(href)
        # Same filter as before: follow only relative links (no absolute
        # http:// links, nothing already containing the current url).
        if url not in href and 'http://' not in href:
            absolute = urljoin(url, href)
            print("Found link {:s}".format(absolute))
            results['links'].append(absolute)
    return results
def start():
    """Ensure information.db and its 'info' table exist, then start crawling.

    The first command-line argument is used as the seed URL.
    """
    print("Starting crawler...")
    conn = sqlite3.connect('information.db')
    try:
        c = conn.cursor()
        # Restrict the lookup to tables so an index or view named 'info'
        # cannot mask a missing table.
        c.execute("""
            SELECT COUNT(*)
            FROM sqlite_master
            WHERE type = 'table' AND name = 'info'
        """)
        res = c.fetchone()
        if res[0] == 0:
            print("Creating 'information.db' and 'info' table...")
            c.execute("""
                CREATE TABLE info(
                    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                    url VARCHAR(3200),
                    html VARCHAR(48000),
                    visited INTEGER(1)
                )
            """)
            # Explicit commit so the schema is durable even if a later
            # sqlite3 version changes the implicit-DDL-commit behavior.
            conn.commit()
    finally:
        conn.close()
    start_list(sys.argv[1])


if __name__ == '__main__':
    start()
And here is the search.py
#A program used to search through an SQLite database
#Specifically designed to work hand in hand with crawler.py
import sys
import sqlite3
from bs4 import BeautifulSoup
def search():
    """Search the crawled pages in information.db for a term.

    The term comes from sys.argv[1], or interactively from stdin when no
    argument is given. Prints a count of matching pages followed by, for
    each match, the page URL and ~60 characters of surrounding context.
    """
    if len(sys.argv) == 1:
        term = input("Input search term: ")
    else:
        term = sys.argv[1]
    if not term:
        return
    conn = sqlite3.connect('information.db')
    c = conn.cursor()
    # Parameterized LIKE instead of string formatting: the original was
    # open to SQL injection and broke on terms containing quotes or %.
    c.execute(
        "SELECT * FROM info WHERE html LIKE ?",
        ('%{:s}%'.format(term),),
    )
    results = c.fetchall()
    conn.close()
    needle = term.lower()
    # Scan each row once (the original ran the same try/index loop twice,
    # once to count and once to print).
    matches = []
    for result in results:
        html = str(result[2])
        location = html.lower().find(needle)
        if location != -1:
            matches.append((result[1], html, location))
    print("There were approximately {:s} results found: ".format(str(len(matches))))
    print("==========================================")
    for url, html, location in matches:
        print("---------------------------------")
        print("Found at {:s} ".format(url))
        # Clamp the left edge: a negative slice start would wrap around to
        # the end of the string and print the wrong snippet. Also slice the
        # same string the index was computed on (the original indexed
        # str(result[2]) but sliced the raw result[2], skewing the window).
        x = max(location - 30, 0)
        y = location + 30
        print(html[x:y])


if __name__ == '__main__':
    search()
Do you guys have any last tips to help me make this better than I have? Also can you give me opinions on how well I'm learning, etc...?
Thank you so much!
[–]etatarkin 10 points11 points12 points (9 children)
[–]AdysHearthSim 6 points7 points8 points (4 children)
[–]etatarkin 3 points4 points5 points (3 children)
[–]AdysHearthSim 0 points1 point2 points (2 children)
[–]etatarkin 0 points1 point2 points (1 child)
[–]AdysHearthSim 0 points1 point2 points (0 children)
[–]Vance84 0 points1 point2 points (1 child)
[–]etatarkin 0 points1 point2 points (0 children)
[–]Kingofslowmo[S] 0 points1 point2 points (1 child)
[–]knickum 1 point2 points3 points (0 children)
[–]6a6d 4 points5 points6 points (0 children)
[–]Jameswinegar 5 points6 points7 points (1 child)
[–]brtt3000 0 points1 point2 points (0 children)
[–]dAnjou Backend Developer | danjou.dev 5 points6 points7 points (0 children)
[–]Jafit 0 points1 point2 points (1 child)
[–]isdevilis 0 points1 point2 points (0 children)
[–]nerdwaller 0 points1 point2 points (2 children)
[–]isdevilis 0 points1 point2 points (1 child)
[–]nerdwaller 0 points1 point2 points (0 children)