import urllib
import urllib2
def get_next_target(page):
    """Find the first '<a href=' link in *page*.

    Returns a ``(url, end_quote_index)`` tuple for the first link found,
    or ``(None, 0)`` when the page contains no further links.  The index
    of the closing quote lets the caller slice the page and keep
    scanning from just past this link.
    """
    # Bug fix: original line was missing the closing ')' on this call.
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    # Assumes the URL is double-quoted immediately after 'href=';
    # unquoted hrefs are not handled (matches original behavior).
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def print_all_links(page):
    """Print every '<a href=...' URL found in *page*, one per line.

    Repeatedly asks get_next_target() for the next link, then slices
    the page past that link and continues until no link (or an empty
    href) is found.
    """
    while True:
        url, endpos = get_next_target(page)
        # Falsy url covers both "no link found" (None) and an empty
        # href="" -- original stopped on either, so keep that behavior.
        if not url:
            break
        # Single-argument print() behaves identically under Python 2
        # and is valid Python 3 syntax.
        print(url)
        page = page[endpos:]
# Interactive driver: prompt for a host, fetch it over HTTPS, and print
# every link on the returned page.  Repeats until the user answers 'n'.
ans = 'y'
while ans != 'n':
    user_page = raw_input("Enter a url to attempt to crawl: ")
    try:
        # NOTE(review): scheme is hard-coded, so the user should enter a
        # bare host/path without 'https://' -- confirm intended format.
        user_content = urllib2.urlopen("https://" + user_page)
        user_page = user_content.read()
        print_all_links(user_page)
    # URLError is the parent of HTTPError, so this also covers DNS
    # failures / refused connections that previously crashed the loop.
    except urllib2.URLError:
        print("Error can't crawl")
    # Bug fix: lower() the *answer*, not the prompt literal, so typing
    # 'N' exits the loop; the original lowercased the prompt string.
    ans = raw_input('Search another site?(y/n)').lower()
# Copyright © 2024, NextGenUpdate.
# All Rights Reserved.