Python: Webscraping With BeautifulSoup

Some experiments in website scraping using Python 2.7 with BeautifulSoup 3.2. The first function here shows various manipulations of an HTML page, including saving a scrubbed file to disk. The second function shows a simple crawler that attempts to traverse a domain and build a sitemap from hyperlinks encountered in the pages. Includes some commentary on page encoding, parsers and multiple approaches to some tasks.

import random
import re

from urllib2 import urlopen
from urllib2 import HTTPError
from urllib2 import URLError

from BeautifulSoup import BeautifulSoup, Comment

def webScraper():
    """Connect to a remote site, grab a page, and demonstrate BeautifulSoup queries.

    Fetches a page, runs a series of example lookups against it (tags,
    attributes, comments), strips all <span> tags, and writes the scrubbed
    document to disk, named after the page title (or a random number if the
    page has no title).

    Raises nothing to the caller: connection failures are reported on stdout
    and the function returns early.
    """
    try:
        # TODO: point this at the page you actually want to scrape
        html = urlopen("http://www.example.com")
    except HTTPError as e:
        # Server answered, but with an error status (404, 500, ...)
        print("The server returned an HTTP error!")
        return
    except URLError as e:
        # DNS failure / no route to host
        print("The server could not be found!")
        return
    else:
        print("It worked!")
    # No need to specify a parser unless you want to override the default.
    # BS3 uses SGMLParser; BS4 uses lxml if installed, else html.parser.
    soupObj = BeautifulSoup(html)
    # Print entire document
    print(soupObj)
    # Print first h1 tag, with tags
    print(soupObj.h1)
    # Print value and list of values of title tag
    print(soupObj.title.string)
    print(soupObj.title.contents)
    # ---------Working with the table---------- #
    # Calling the soup object returns all matching tags (same as findAll)
    bloodTypeTable = soupObj("table")
    print(bloodTypeTable)
    # Return first table whose tag has a 'summary' attribute
    bloodTypeTable = soupObj.find("table", {"summary": re.compile(".*")})
    # Alt. method:
    # bloodTypeTable = soupObj.find("table", summary=True)
    print(bloodTypeTable)
    # Return only the value of the summary attribute
    bloodTypeTable = soupObj.find("table", {"summary": re.compile(".*")})["summary"]
    print(bloodTypeTable)
    # Return all contents of specific <table> tags
    bloodTypeTable = soupObj.find("table", summary=True).contents
    print(bloodTypeTable)
    # Return all child <td> tags of specific <table> tag
    bloodTypeTable = soupObj.find("table", summary=True).findChildren("td")
    print(bloodTypeTable)
    # Find all the comments (<!-- -->) within the document
    comments = soupObj.findAll(text=lambda text: isinstance(text, Comment))
    print(comments)
    # Remove all span tags (extract() detaches the tag from the tree),
    # then output the scrubbed file to disk
    for tag in soupObj.findAll("span"):
        tag.extract()
    print(soupObj.prettify())
    if not soupObj.title.string:
        # No usable title: fall back to a random filename
        fileName = str(random.randint(1, 5000)) + ".html"
    else:
        fileName = soupObj.title.string + ".html"
    output = open(fileName, "w")
    try:
        output.write(soupObj.prettify())
    finally:
        output.close()

# Simple domain crawler that builds a sitemap from links on index page
# Consider using scrapy for this instead

# Set of every href seen so far; shared across recursive calls
pages = set()


def getLinks(pageUrl):
    """Recursively crawl from pageUrl, collecting every new href into `pages`.

    Each previously-unseen link is recorded and then crawled in turn, so the
    traversal walks the whole reachable link graph. The accumulated sitemap
    is printed when a call finishes.

    NOTE: By default BS3 only handles ASCII (codes 0-127); feeding it other
    content (e.g. a JPEG) makes sgmllib raise UnicodeEncodeError, such as
    "'ascii' codec can't encode character u'xff' in position 5: ordinal not
    in range(128)" for the JPEG marker. For non-ASCII pages, pass the real
    encoding, e.g. BeautifulSoup(html, from_encoding="iso-8859-8") — the
    meta charset value is a good hint — and print with
    soupObj.prettify("latin-1") (again using the correct encoding).
    """
    global pages
    html = urlopen(pageUrl)
    soupObj = BeautifulSoup(html)
    # href=re.compile("") matches any anchor that has an href attribute
    for link in soupObj.findAll("a", href=re.compile("")):
        if link.get('href') not in pages:
            # We found a new page: record it, then crawl it in turn
            newPage = link.get('href')
            pages.add(newPage)
            getLinks(newPage)
    # Dump the sitemap gathered so far
    print("\n\n")
    print(pages)



Leave a Reply

Your email address will not be published. Required fields are marked *