Ok bro, de todas formas, si quieres mi consejo, es más eficiente usar expresiones regulares que BeautifulSoup.
Si te sirve de algo, nosotros hicimos un script para GRAMPUS que extrae las etiquetas meta de los HTML y parsea la información con expresiones regulares.
Te lo pego aquí:
[Enlace externo eliminado para invitados]
import re,sys
class ms_Meta():
    """Extract meta, script and stylesheet information from an HTML file.

    The file is scanned line by line with the patterns in ``regexps``.  Each
    pattern is anchored at the start of the stripped line, so a tag is only
    recognised when its line begins with ``<``.  Results are collected in
    ``metaData`` and printed once the scan has finished.

    Attributes:
        pathFile: path of the HTML file, as given to the constructor.
        metaData: dict holding the lists ``"Scripts"`` (``src`` of script
            tags) and ``"Css"`` (``href`` of link tags), plus one entry per
            meta tag keyed by the lower-cased ``name`` / ``http-equiv`` /
            ``property`` value and holding the ``content`` value.
        regexps: the pattern strings; their order matters because the way a
            match is stored depends on the pattern's position in the list.
        fileParsed: the file object; it is closed once parsing is done.
    """

    # Positions in ``regexps`` whose matches are stored differently.
    _SCRIPT_INDEX = 2                   # group 5 is a script URL
    _CSS_INDEX = 3                      # group 5 is a link URL
    _CONTENT_FIRST_INDEXES = (5, 6, 7)  # "content" precedes the key attribute

    def __init__(self, pathFile):
        """Parse *pathFile* immediately and print what was found.

        Args:
            pathFile: path of the HTML file to analyse.

        Raises:
            SystemExit: if the file cannot be opened.
        """
        self.pathFile = pathFile
        self.metaData = {"Scripts": [], "Css": []}
        self.regexps = [
            "<(.*)meta(.*)http-equiv(.*)=(.*)[\"\'](.+)[\"\'](.*)content(.*)=(.*)[\"\'](.+)[\"\'](.*)",
            "<(.*)meta(.*)name(.*)=(.*)[\"\'](.+)[\"\'](.*)content(.*)=(.*)[\"\'](.+)[\"\'](.*)",
            "<(.*)script(.*)src(.*)=(.*)[\"\'](.+?)[\"\'](.*)",
            "<(.*)link(.*)href(.*)=(.*)[\"\'](.+?)[\"\'](.*)",
            "<(.*)meta(.*)property(.*)=(.*)[\"\'](.+)[\"\'](.*)content(.*)=(.*)[\"\'](.+)[\"\'](.*)",
            "<(.*)meta(.*)content(.*)=(.*)[\"\'](.+)[\"\'](.*)name(.*)=(.*)[\"\'](.+)[\"\'](.*)",
            "<(.*)meta(.*)content(.*)=(.*)[\"\'](.+)[\"\'](.*)http-equiv(.*)=(.*)[\"\'](.+)[\"\'](.*)",
            "<(.*)meta(.*)content(.*)=(.*)[\"\'](.+)[\"\'](.*)property(.*)=(.*)[\"\'](.+)[\"\'](.*)",
        ]
        self.__openFile()
        self.__parserMetaName()

    def __openFile(self):
        """Open ``pathFile`` for reading and keep the handle in ``fileParsed``.

        Raises:
            SystemExit: carrying an explanatory message (which also yields a
                non-zero exit status) when the file cannot be opened.
        """
        try:
            self.fileParsed = open(self.pathFile)
        except (IOError, OSError) as err:
            # Only I/O failures are handled here; exiting with a message
            # instead of status 0 lets callers and shells see the failure.
            sys.exit("ms_Meta: cannot open %r: %s" % (self.pathFile, err))

    def __parserMetaName(self):
        """Run every pattern against every line, fill ``metaData``, print it.

        Tag and attribute names are matched case-insensitively.  Meta keys are
        stored lower-cased, while captured values keep the case they have in
        the file, because URLs and content text can be case-sensitive.  The
        file is closed when the scan ends, even if an error occurs.
        """
        compiled = [re.compile(pattern, re.IGNORECASE)
                    for pattern in self.regexps]
        with self.fileParsed:
            for linea in self.fileParsed:
                text = linea.strip()
                # A line is tried against all patterns; every match is stored.
                for index, pattern in enumerate(compiled):
                    found = pattern.match(text)
                    if found is not None:
                        self.__storeMatch(index, found)
        # Show the collected information.
        for elemento in self.metaData:
            print(elemento + " " + str(self.metaData[elemento]) + "\n")

    def __storeMatch(self, index, found):
        """Record one successful match according to its pattern position."""
        if index == self._SCRIPT_INDEX:
            self.metaData["Scripts"].append(found.group(5))
        elif index == self._CSS_INDEX:
            self.metaData["Css"].append(found.group(5))
        elif index in self._CONTENT_FIRST_INDEXES:
            # content="..." comes first: group 5 is the value, group 9 the key.
            self.metaData[found.group(9).lower()] = found.group(5)
        else:
            # key attribute comes first: group 5 is the key, group 9 the value.
            self.metaData[found.group(5).lower()] = found.group(9)
if __name__ == "__main__":
    # Parse the file named on the command line; with no argument, fall back to
    # the original default so existing invocations keep working.
    ms_Meta(sys.argv[1] if len(sys.argv) > 1 else "index4.html")