#!/usr/bin/python """Ping all traceback-eligable or pingback-elibable servers associated with hrefs found in a given blog entry Based on Sam Ruby's code http://www.intertwingly.net/blog/1138.html With tweaks from Joseph Reagle (20040212) http://reagle.org/joseph/blog//technology/python/pyblosxom-autoping """ import re, sgmllib, os, sys, urllib, xmlrpclib from xml.sax import parseString, SAXParseException from xml.sax.handler import ContentHandler def excerpt(entry_path, title, body): """ filename,entry_path,title,body => url,args Excerpt the body and urlencode the trackback arguments. """ body = re.split('(.*?)<\/div>',body)[:2][-1] body = re.sub('\n',' ',body) body = re.sub(' ',' ',body) body = re.sub('^(

\s*)?[\w\s\.]+<\/a>:\s*','',body) body = re.sub('.*?<\/em>\.?\s*','',body) body = re.sub('<.*?>','',body) body = body[:252] url = config.py['base_url'] + entry_path url = re.sub('.txt$','.html',url) arg = {} arg['url'] = url arg['title'] = title arg['blog_name'] = config.py['blog_title'] arg['excerpt'] = body return url, urllib.urlencode(arg) class link(sgmllib.SGMLParser): """ source -> list of trackbacks, list of pingbacks Parse a given html page, and retrieve the trackbacks associated with pages referenced via href found. """ def __init__(self, entry_path, title, body): sgmllib.SGMLParser.__init__(self) self.trackbacks = [] self.pingbacks = [] self.entry_path = entry_path self.title = title (self.url,self.args) = excerpt(entry_path,title,body) self.feed(body) def start_a(self, attrs): attrs = dict(attrs) if attrs.has_key('href'): try: href = attrs['href'] trackback,pingback = backrefs(href) self.trackbacks = self.trackbacks + trackback self.pingbacks = self.pingbacks + pingback except: pass tb_re=re.compile('()') pb_re=re.compile('') def backrefs(href): """ href -> ([trackbacks],[pingbacks]) Parse a given html page, and retrieve the rdf:about, X-Pingback header, or pingback link information associated with a given href. At most one is returned (in the above priority). """ base = href.split("#")[0] file = urllib.urlopen(base) info = file.info() data = file.read().replace('\n',' ') file.close() trackback = [] pingback = pb_re.findall(data)[:1] for x in tb_re.findall(data): try: parseString(x, rdf()) except SAXParseException: pass if info.has_key("X-Pingback"): pingback=[info["X-Pingback"]] if rdf.ids.has_key(href): trackback = [rdf.ids[href]] if not trackback and not pingback and href.find("#")>0: if rdf.ids.has_key(base): trackback = [rdf.ids[base]] if trackback: pingback=[] if pingback: pingback=[(href, pingback[0])] return (trackback, pingback) class rdf(ContentHandler): """ xml -> dictionary of {dc:identifier => trackback:ping|rdf:about} Parse a given html page, and retrieve the rdf:about information associated with a given href. """ ids = {} def startElement(self, name, attrs): if name == 'rdf:Description': attrs=dict(attrs) if attrs.has_key('dc:identifer'): attrs['dc:identifier'] = attrs['dc:identifer'] if attrs.has_key('dc:identifier'): if attrs.has_key('trackback:ping'): self.ids[attrs['dc:identifier']] = attrs['trackback:ping'] elif attrs.has_key('about'): self.ids[attrs['dc:identifier']] = attrs['about'] elif attrs.has_key('rdf:about'): self.ids[attrs['dc:identifier']] = attrs['rdf:about'] def trackback(parser): """ parser -> None Ping all trackbacks encountered with the url, title, blog_name, and excerpt. """ for url in parser.trackbacks: try: print "" print "*** Trackback " + url print parser.args if url.find('?tb_id=') >= 0: file=urllib.urlopen(url + "&" + parser.args) else: file=urllib.urlopen(url, parser.args) print file.read() file.close() except: pass def pingback(parser): """ parser -> None Ping all pingbacks encountered with the source and targets """ for target,server in parser.pingbacks: try: print "" print "*** Pingback " + server server=xmlrpclib.Server(server) # print server.pingback.ping(parser.url,target) except: pass if __name__ == '__main__': # the pybloxsom config module has useful variables pybloxsom_config_path = "/home/reagle/data/2web/pyblosxom-1.1/contrib/" sys.path.append(pybloxsom_config_path) import config for name in sys.argv[1:]: name_abspath = os.path.abspath(name) name_ext = os.path.splitext(name)[1] if name_abspath.startswith(config.py['datadir']): entry_path = name_abspath[len(config.py['datadir']):] try: file = open(name) body = file.read() file.close() # My htmlentryparser looks for the title in h1 if name_ext == ".html" and "htmlentryparser" in config.py['load_plugins']: title = re.search("

(.*)

",body).group(1) else: # otherwise assume the deafult text entries title = file.readline().splitlines()[0] parser = link(entry_path,title,body) trackback(parser) pingback(parser) except Exception, e: print "!!! Exception", e