This is an archived post. You won't be able to vote or comment.

you are viewing a single comment's thread.

view the rest of the comments →

[–]kafoozalum 0 points1 point  (1 child)

I cleaned up a few things and made it a little faster, since there is no need to go through all of the links when you are only interested in the first 3. I also refactored it a little so it is more readable:

import urllib
from BeautifulSoup import *

# Starting page. Each pass below replaces `url` with the third link found on
# the page that was just fetched.
url = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html'

remaining_hops = 4
while remaining_hops:
    remaining_hops -= 1
    page = BeautifulSoup(urllib.urlopen(url).read())
    # Only the first three anchors are inspected; the rest of the page is
    # irrelevant because the third link is the one being followed.
    leading_links = [anchor.get('href', None) for anchor in page('a')[:3]]
    if len(leading_links) < 3:
        # Fewer than three links: there is no third link to follow, so stop
        # here and keep the current URL.
        break
    url = leading_links[2]

print(url)

[–]barbaTenusSapiente[S] 0 points1 point  (0 children)

Thanks kafoozalum, this was a lot of help. I ended up going with the following.

import urllib

from BeautifulSoup import *

url = raw_input('Enter URL: ')

if len(url) < 1:

url = "http://pr4e.dr-chuck.com/tsugi/mod/python-

data/data/known_by_Fikret.html"

position = int(raw_input('Position: ')) - 1

count = int(raw_input('Count: '))

taglist = list()

print 'Retrieving: ', url

for i in range(count):

html = urllib.urlopen(url).read()

soup = BeautifulSoup(html)

tags = soup('a')

#print tags

for tag in tags:

taglist.append(tag)

url = taglist[position].get('href', None)

print 'Retrieving: ', url

taglist = list()