Some experiments in website scraping using Python 2.7 with BeautifulSoup 3.2. The first function below demonstrates various manipulations of an HTML page, including saving a scrubbed file to disk. The second is a simple crawler that traverses a domain and builds a sitemap from the hyperlinks it encounters. Commentary on page encoding, parsers, and alternative approaches to several tasks appears inline.
from urllib2 import urlopen
from urllib2 import HTTPError
from urllib2 import URLError
from BeautifulSoup import BeautifulSoup, Comment
import re
import random
def webScraper():
    """ Connect to remote site, grab page """
    try:
        html = urlopen("http://mysafeinfo.com/api/data?list=bloodtypesdistlist&format=html")
        #html = urlopen("http://monasterphoto.com/stones/stones_prev22.htm")
    except HTTPError as e:
        print(e)
        return
    except URLError as e:
        print("The server could not be found!")
        return
    else:
        print("It worked!")
    '''
    No need to specify a parser unless you want to override the default. BS3 uses
    SGMLParser; BS4 will use lxml if installed, otherwise defaulting to html.parser.
    '''
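    # For example, under BS4 the parser can be named explicitly (shown for
    # comparison only; this script runs BS3, whose constructor takes no parser name):
    #   soupObj = BeautifulSoup(html, "html.parser")  # stdlib parser
    #   soupObj = BeautifulSoup(html, "lxml")         # lxml, if installed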
    soupObj = BeautifulSoup(html)
    # Print entire document
    print soupObj
    # Print first h1 tag, with tags
    print soupObj.h1
    # Print value and list of values of title tag
    print soupObj.title.string
    print soupObj.title.contents
    #---------Working with the table----------#
    # Return a list of all tables in the document
    bloodTypeTable = soupObj("table")
    print bloodTypeTable
    # Return the first table whose tag has a 'summary' attribute
    bloodTypeTable = soupObj.find("table", {"summary" : re.compile(".*")})
    # Alt. method
    #bloodTypeTable = soupObj.find("table", summary=True)
    print bloodTypeTable
    # Return only the value of the summary attribute
    bloodTypeTable = soupObj.find("table", {"summary" : re.compile(".*")})["summary"]
    print bloodTypeTable
    # Return all contents of a specific <table> tag
    bloodTypeTable = soupObj.find("table", summary=True).contents
    print bloodTypeTable
    # Return all child <td> tags of a specific <table> tag
    bloodTypeTable = soupObj.find("table", summary=True).findChildren("td")
    print bloodTypeTable
    #-----------------------------------------#
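    # A minimal sketch of flattening the table into a list of rows (assumes the
    # cells hold plain text; BS3's findAll(text=True) collects a tag's strings):
    rows = []
    for tr in soupObj.find("table", summary=True).findChildren("tr"):
        rows.append([u"".join(td.findAll(text=True)) for td in tr.findChildren("td")])
    print rows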
    # Find all the comments (<!-- -->) within a document
    comments = soupObj.findAll(text=lambda text: isinstance(text, Comment))
    print comments
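    # To scrub the comments out instead of just listing them, extract() removes
    # each node from the tree (left commented out so the output is unchanged):
    #for comment in comments:
    #    comment.extract()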
    # Remove all span tags, output file to disk:
    for tag in soupObj.findAll("span"):
        tag.decompose()
    print soupObj.prettify()
    if not soupObj.title.string:
        fileName = str(random.randint(1, 5000)) + ".html"
    else:
        fileName = soupObj.title.string + ".html"
    output = open(fileName, "w")
    output.write(soupObj.prettify())
    output.close()
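    # Note: a page title can contain characters that are illegal in filenames
    # ("/", ":", etc.); a defensive variant (hypothetical whitelist) might be:
    #fileName = re.sub(r"[^\w\- ]", "_", soupObj.title.string) + ".html"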
webScraper()
# Simple domain crawler that builds a sitemap from links on index page
# Consider using scrapy for this instead
pages = set()
def getLinks(pageUrl):
    global pages
    #print pageUrl
    html = urlopen(pageUrl)
    soupObj = BeautifulSoup(html)
    for link in soupObj.findAll("a", href=re.compile("geoffstratton.com")):
        #print link
        #print link.get('href')
        if link.get('href') not in pages:
            # We found a new page
            '''
            By default, BeautifulSoup only processes text whose contents fall
            within the ASCII control and printable character sets (codes 0-127).
            Attempting to parse other file types with BS3 will cause sgmllib to
            throw a UnicodeEncodeError here, e.g., "'ascii' codec can't encode
            character u'\xff' in position 5: ordinal not in range(128)" for the
            JPEG marker. If you are parsing text other than basic ASCII, specify
            the original encoding in the form
            soupObj = BeautifulSoup(html, fromEncoding="iso-8859-1")
            (spelled from_encoding in BS4), where the value is the actual page
            encoding. If you don't know it offhand, check the page's meta
            charset value. For output, use print soupObj.prettify("latin-1"),
            again with the correct encoding.
            For image processing, try Pillow:
            from PIL import Image, ImageFilter
            catPicture = Image.open("cat.jpg")
            catPictureBlurred = catPicture.filter(ImageFilter.GaussianBlur)
            catPictureBlurred.save("cat_blurred.jpg")
            '''
            newPage = link.get('href')
            print(newPage)
            pages.add(newPage)
            getLinks(newPage)
print "nn" + pages
getLinks("http://www.geoffstratton.com/")