Some experiments in website scraping using Python 2.7 with BeautifulSoup 3.2. The first function below shows various manipulations of an HTML page, including saving a scrubbed copy to disk. The second is a simple crawler that attempts to traverse a domain and build a sitemap from the hyperlinks it encounters. Along the way there is some commentary on page encodings, parsers, and alternative approaches to a few of the tasks.
from urllib2 import urlopen
from urllib2 import HTTPError
from urllib2 import URLError
from BeautifulSoup import BeautifulSoup, Comment
import random
import re

def webScraper():
    """ Connect to remote site, grab page """
    try:
        html = urlopen("http://mysafeinfo.com/api/data?list=bloodtypesdistlist&format=html")
        #html = urlopen("http://monasterphoto.com/stones/stones_prev22.htm")
    except HTTPError as e:
        print(e)
        return
    except URLError as e:
        print("The server could not be found!")
        return
    else:
        print("It worked!")

    '''
    No need to specify a parser unless you want to override the default.
    BS3 uses SGMLParser; BS4 will use lxml if installed, otherwise
    defaulting to html.parser.
    '''
    soupObj = BeautifulSoup(html)

    # Print entire document
    print soupObj

    # Print first h1 tag, with tags
    print soupObj.h1

    # Print value and list of values of title tag
    print soupObj.title.string
    print soupObj.title.contents

    #---------Working with the table----------#

    # Return a list of all <table> tags
    bloodTypeTable = soupObj("table")
    print bloodTypeTable

    # Return first table whose <table> tag has a 'summary' attribute
    bloodTypeTable = soupObj.find("table", {"summary" : re.compile(".*")})
    # Alt. method
    #bloodTypeTable = soupObj.find("table", summary=True)
    print bloodTypeTable

    # Return only the value of the summary attribute
    bloodTypeTable = soupObj.find("table", {"summary" : re.compile(".*")})["summary"]
    print bloodTypeTable

    # Return all contents of that specific <table> tag
    bloodTypeTable = soupObj.find("table", summary=True).contents
    print bloodTypeTable

    # Return all child <td> tags of that specific <table> tag
    bloodTypeTable = soupObj.find("table", summary=True).findChildren("td")
    print bloodTypeTable

    #-----------------------------------------#

    # Find all the comments (<!-- -->) within a document
    comments = soupObj.findAll(text=lambda text: isinstance(text, Comment))
    print comments

    # Remove all span tags, output file to disk
    for tag in soupObj.findAll("span"):
        tag.decompose()
    print soupObj.prettify()
    if not soupObj.title.string:
        # No usable title, so fall back to a random file name
        fileName = str(random.randint(1, 5000)) + ".html"
    else:
        fileName = soupObj.title.string + ".html"
    output = open(fileName, "w")
    output.write(soupObj.prettify())
    output.close()

webScraper()

# Simple domain crawler that builds a sitemap from links on the index page
# Consider using scrapy for this instead
pages = set()

def getLinks(pageUrl):
    global pages
    #print pageUrl
    html = urlopen(pageUrl)
    soupObj = BeautifulSoup(html)
    for link in soupObj.findAll("a", href=re.compile("geoffstratton.com")):
        #print link
        #print link.get('href')
        if link.get('href') not in pages:
            # We found a new page
            '''
            By default, BeautifulSoup only processes text whose contents fall
            within the ASCII control and printable character sets (codes 0-127).
            Attempting to parse other file types with BS 3 will cause sgmllib
            to throw a UnicodeEncodeError here, e.g., "'ascii' codec can't
            encode character u'\xff' in position 5: ordinal not in range(128)"
            for the JPEG marker.

            If you are parsing text other than basic ASCII, specify the
            original encoding in the form
            soupObj = BeautifulSoup(html, fromEncoding="iso-8859-8"),
            where fromEncoding is the actual page encoding. (BS4 renames this
            argument to from_encoding.) If you don't know the encoding offhand,
            you can use the page's meta charset value. For output, use
            print soupObj.prettify("latin-1"), again substituting the correct
            encoding.
            For image processing, try Pillow:

                from PIL import Image, ImageFilter
                catPicture = Image.open("cat.jpg")
                catPictureBlurred = catPicture.filter(ImageFilter.GaussianBlur)
                catPictureBlurred.save("cat_blurred.jpg")
            '''
            newPage = link.get('href')
            print(newPage)
            pages.add(newPage)
            getLinks(newPage)
    print "\n\n" + str(pages)

getLinks("http://www.geoffstratton.com/")
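A note on the parser commentary above: in BS4 you can also name the parser explicitly instead of relying on the lxml-then-html.parser fallback. A minimal sketch, assuming bs4 (and optionally lxml) is installed, reusing the same test URL:

from urllib2 import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://mysafeinfo.com/api/data?list=bloodtypesdistlist&format=html")
# Ask for lxml explicitly; pass "html.parser" instead to force the stdlib parser
soupObj = BeautifulSoup(html, "lxml")
print soupObj.title.string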
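To make the encoding notes in the crawler comment concrete, here is a minimal BS3 sketch; the URL and iso-8859-8 are placeholders for the actual page and whatever charset it declares:

from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

html = urlopen("http://example.com/legacy-page.html").read()

# Tell BS3 the source encoding up front (BS4 renames this to from_encoding)
soupObj = BeautifulSoup(html, fromEncoding="iso-8859-8")

# BS3 records the encoding it settled on, useful for checking a guess
print soupObj.originalEncoding

# Re-encode on the way out
print soupObj.prettify("iso-8859-8")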
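One limitation of the crawler above: it stores hrefs exactly as they appear in the page, so a relative link and an absolute link to the same URL count as different pages. A sketch of one way to normalize them with the standard library; normalizeLink() is a hypothetical helper, not part of the original script:

from urlparse import urljoin, urldefrag

def normalizeLink(pageUrl, href):
    # Resolve a possibly-relative href against the current page URL,
    # then drop any #fragment so anchors don't create duplicate entries
    absolute = urljoin(pageUrl, href)
    return urldefrag(absolute)[0]

# normalizeLink("http://www.geoffstratton.com/a/", "../b.html#top")
# -> "http://www.geoffstratton.com/b.html"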
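As for the scrapy suggestion in the crawler comments, here is a rough sketch of the same sitemap idea as a Scrapy spider. The class name and item field are mine, and details vary by Scrapy version, so treat this as an outline rather than the canonical approach:

import scrapy

class SitemapSpider(scrapy.Spider):
    name = "sitemap"
    allowed_domains = ["geoffstratton.com"]      # keeps the crawl on one domain
    start_urls = ["http://www.geoffstratton.com/"]

    def parse(self, response):
        for href in response.xpath("//a/@href").extract():
            url = response.urljoin(href)
            yield {"url": url}                             # one sitemap entry per link
            yield scrapy.Request(url, callback=self.parse) # keep crawling

Run it with something like scrapy runspider sitemap_spider.py -o sitemap.json; Scrapy's built-in duplicate filter takes the place of the pages set, and it handles retries and throttling that the recursive version above ignores.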