Python - WebData
-1 urllib Example
import urllib.request
import json
from html.parser import HTMLParser
class
def main1():
    """Fetch the Baidu home page and print its HTTP status code and raw body.

    The response is used as a context manager so the underlying connection
    is closed even if reading the body raises.
    """
    with urllib.request.urlopen("http://www.baidu.com") as response:
        # ``status`` replaces the deprecated ``getcode()`` accessor.
        print(response.status)
        # read() returns the undecoded body as bytes.
        data = response.read()
    print(data)
-2 Parser HTML Example
#
# Example file for parsing and processing HTML
#
# import the HTMLParser module
# in Python 3 you need to import from html.parser
from html.parser import HTMLParser
# Running total of <meta> start tags seen by MyHTMLParser instances.
metacount = 0


# create a subclass of HTMLParser and override the handler methods
class MyHTMLParser(HTMLParser):
    """HTML parser that prints every tag, text run and comment it encounters.

    Each reported event is followed by the position returned by ``getpos()``.
    ``<meta>`` start tags are additionally tallied in the module-level
    ``metacount`` variable.
    """

    def _print_position(self):
        """Print the current parser position as reported by ``getpos()``."""
        # getpos() returns a tuple of (line number, character offset)
        line, offset = self.getpos()
        print("\tAt line: ", line, " position ", offset)

    def handle_starttag(self, tag, attrs):
        """Report an opening tag and its attributes; count ``<meta>`` tags.

        Called by the parser when the closing ">" of the tag is reached.

        Args:
            tag: Name of the tag.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        global metacount
        if tag == "meta":
            metacount += 1
        print("Encountered a start tag:", tag)
        self._print_position()
        if attrs:
            print("\tAttributes:")
            for name, value in attrs:
                print("\t", name, "=", value)

    def handle_endtag(self, tag):
        """Report a closing tag and where it was found."""
        print("Encountered an end tag:", tag)
        self._print_position()

    def handle_data(self, data):
        """Report character data (tag contents), ignoring whitespace-only runs."""
        if data.isspace():
            return
        print("Encountered some text data:", data)
        self._print_position()

    def handle_comment(self, data):
        """Report an HTML comment and where it was found."""
        print("Encountered comment:", data)
        self._print_position()
def main():
    """Parse ``samplehtml.html`` with MyHTMLParser and print the meta-tag count.

    The file is opened with a context manager so the handle is closed once
    its contents have been read.
    """
    # instantiate the parser and feed it some HTML
    parser = MyHTMLParser()
    # open the sample HTML file and read it in full
    with open("samplehtml.html") as html_file:
        contents = html_file.read()
    parser.feed(contents)
    # metacount is the module-level counter updated by the parser
    print(f"{metacount} meta tags encountered")


if __name__ == "__main__":
    main()
-3 Parser XML Example
#
# Example file for parsing and processing XML
#
import xml.dom.minidom
def _print_skills(doc):
    """Print the number of <skill> elements in *doc*, then each one's name."""
    skills = doc.getElementsByTagName("skill")
    print(f"{len(skills)} skills:")
    for skill in skills:
        print(skill.getAttribute("name"))


def main():
    """Load ``samplexml.xml``, list its skills, add "jQuery" and list them again.

    The root element is reached through ``documentElement`` rather than
    ``firstChild`` so that a leading comment or doctype in the file does not
    get mistaken for the root element.
    """
    # use the parse() function to load and parse an XML file
    doc = xml.dom.minidom.parse("samplexml.xml")
    root = doc.documentElement

    # print out the document node and the name of the root tag
    print(doc.nodeName)
    print(root.tagName)

    # get a list of XML tags from the document and print each one
    _print_skills(doc)

    # create a new XML tag and add it into the document
    new_skill = doc.createElement("skill")
    new_skill.setAttribute("name", "jQuery")
    root.appendChild(new_skill)

    _print_skills(doc)


if __name__ == "__main__":
    main()