#!/usr/bin/python
"""Ping all trackback-eligible or pingback-eligible servers associated with
hrefs found in a given blog entry.
Based on Sam Ruby's code
http://www.intertwingly.net/blog/1138.html
With tweaks from Joseph Reagle (20040212)
http://reagle.org/joseph/blog//technology/python/pyblosxom-autoping
"""
import re, sgmllib, os, sys, urllib, xmlrpclib
from xml.sax import parseString, SAXParseException
from xml.sax.handler import ContentHandler
def excerpt(entry_path, title, body):
    """Build the trackback payload for one blog entry.

    Reduce *body* to a short plain-text excerpt and urlencode it together
    with the entry's URL, title and blog name.

    Args:
        entry_path: entry path that is appended to ``config.py['base_url']``
            to form the entry URL.
        title: entry title, sent as the trackback ``title`` argument.
        body: entry text (HTML) from which the excerpt is taken.

    Returns:
        A ``(url, args)`` tuple: the entry URL (with a trailing ``.txt``
        replaced by ``.html``) and the urlencoded ``url``, ``title``,
        ``blog_name`` and ``excerpt`` arguments.

    Reads the module-level ``config`` that the script entry point imports.
    """
    # NOTE(review): the markup inside the patterns below was missing from this
    # file (tags stripped, two literals split across lines, '&nbsp;' reduced
    # to a plain space).  The tag parts are reconstructed from the surviving
    # fragments -- confirm them against the upstream autoping script.

    # With one capture group, split() yields [before, group, after, ...], so
    # [:2][-1] is the excerpt div's content when the pattern matches and the
    # whole body when it does not.
    body = re.split(r'<div\s+class="excerpt"\s*>(.*?)<\/div>', body)[:2][-1]
    body = body.replace('\n', ' ')
    body = body.replace('&nbsp;', ' ')
    # Drop a leading '<a ...>Name</a>: ' prefix, optionally preceded by <p>.
    body = re.sub(r'^(<p>\s*)?<a\s[^>]*>[\w\s\.]+<\/a>:\s*', '', body)
    # Drop <em>...</em> spans together with a following period/whitespace.
    body = re.sub(r'<em>.*?<\/em>\.?\s*', '', body)
    # Strip whatever tags remain, then keep at most the first 252 characters.
    body = re.sub(r'<.*?>', '', body)
    body = body[:252]

    url = config.py['base_url'] + entry_path
    # The dot is escaped so only a real '.txt' suffix is rewritten (the
    # unescaped form also matched names such as 'footxt').
    url = re.sub(r'\.txt$', '.html', url)

    arg = {
        'url': url,
        'title': title,
        'blog_name': config.py['blog_title'],
        'excerpt': body,
    }
    return url, urllib.urlencode(arg)
class link(sgmllib.SGMLParser):
    """Collect the trackback and pingback endpoints for an entry's links.

    The entry body is parsed as HTML during construction; the href of every
    ``<a>`` tag is handed to ``backrefs`` and the results are accumulated.

    Attributes set by the constructor:
        trackbacks: list of trackback values returned by ``backrefs``.
        pingbacks: list of pingback values returned by ``backrefs``.
        entry_path, title: stored as given.
        url, args: the pair returned by ``excerpt(entry_path, title, body)``.
    """

    def __init__(self, entry_path, title, body):
        sgmllib.SGMLParser.__init__(self)
        self.trackbacks = []
        self.pingbacks = []
        self.entry_path = entry_path
        self.title = title
        self.url, self.args = excerpt(entry_path, title, body)
        # Parsing happens right here; start_a() fires for each anchor tag.
        self.feed(body)

    def start_a(self, attrs):
        """Handle an ``<a>`` tag: look up the back-references of its href."""
        attrs = dict(attrs)
        if 'href' not in attrs:
            return
        try:
            trackback, pingback = backrefs(attrs['href'])
        except Exception:
            # Best effort: a link whose lookup fails is skipped.  Catching
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit still stop the run.
            return
        self.trackbacks.extend(trackback)
        self.pingbacks.extend(pingback)
# Patterns that backrefs() applies to a fetched page:
#   tb_re captures each embedded RDF block, which is then fed to the rdf
#         SAX handler;
#   pb_re captures the href of a pingback <link> element.
# NOTE(review): the markup inside these two patterns was missing from this
# file (they read '()' and ' ', which match the empty string and any single
# space).  The tags below are a reconstruction -- confirm them against the
# upstream autoping script.
tb_re = re.compile(r'(<rdf:RDF .*?</rdf:RDF>)')
pb_re = re.compile(r'<link rel="pingback" href="([^"]+)" ?/?>')
def backrefs(href):
    """Find the trackback or pingback endpoint advertised for *href*.

    The page at *href* (fragment removed) is fetched and searched for
    embedded RDF blocks (``tb_re``), a pingback link (``pb_re``) and an
    ``X-Pingback`` response header.

    Args:
        href: the link target to look up.

    Returns:
        ``(trackbacks, pingbacks)``.  ``trackbacks`` is ``[]`` or a
        one-element list holding the value recorded in ``rdf.ids``;
        ``pingbacks`` is ``[]`` or ``[(href, server)]``.  At most one of the
        two is non-empty: a trackback wins over a pingback, and the
        ``X-Pingback`` header wins over the pingback link.

    Exceptions raised while fetching the page propagate to the caller.
    """
    base = href.split("#")[0]
    page = urllib.urlopen(base)
    try:
        info = page.info()
        data = page.read().replace('\n', ' ')
    finally:
        # Close the connection even when reading the response fails.
        page.close()

    pingback = pb_re.findall(data)[:1]
    for block in tb_re.findall(data):
        try:
            # The handler records what it finds in the class-level rdf.ids.
            parseString(block, rdf())
        except SAXParseException:
            # An RDF block that is not well-formed XML is ignored.
            pass

    if "X-Pingback" in info:
        pingback = [info["X-Pingback"]]

    # rdf.ids is shared by all handler instances and is not reset here, so
    # identifiers recorded from earlier pages are consulted as well.
    trackback = []
    if href in rdf.ids:
        trackback = [rdf.ids[href]]
    if not trackback and not pingback and href.find("#") > 0:
        # Nothing advertised for the fragment URL: try the page URL itself.
        if base in rdf.ids:
            trackback = [rdf.ids[base]]

    if trackback:
        pingback = []
    if pingback:
        pingback = [(href, pingback[0])]
    return (trackback, pingback)
class rdf(ContentHandler):
    """SAX handler recording trackback endpoints from ``rdf:Description``.

    For every ``rdf:Description`` element that carries a ``dc:identifier``
    attribute, ``ids[dc:identifier]`` is set to the element's
    ``trackback:ping`` attribute, or failing that ``about``, or failing that
    ``rdf:about``.  Elements with none of the three are ignored.
    """

    # Class-level on purpose: one mapping shared by every handler instance,
    # read elsewhere in this file as ``rdf.ids``.
    ids = {}

    def startElement(self, name, attrs):
        """Record the endpoint of an ``rdf:Description`` element, if any."""
        if name != 'rdf:Description':
            return
        attrs = dict(attrs)
        # The misspelling 'dc:identifer' is accepted as an alias and, when
        # present, takes the place of 'dc:identifier'.
        if 'dc:identifer' in attrs:
            attrs['dc:identifier'] = attrs['dc:identifer']
        if 'dc:identifier' not in attrs:
            return
        # First attribute present wins, in this priority order.
        for key in ('trackback:ping', 'about', 'rdf:about'):
            if key in attrs:
                self.ids[attrs['dc:identifier']] = attrs[key]
                break
def trackback(parser):
    """Send a trackback ping to every URL in ``parser.trackbacks``.

    Args:
        parser: object providing ``trackbacks`` (iterable of ping URLs) and
            ``args`` (the urlencoded url/title/blog_name/excerpt string).

    Prints the URL, the arguments and the server's response for each ping.
    A ping that fails is reported and does not stop the remaining ones.
    Returns None.
    """
    for url in parser.trackbacks:
        try:
            print("")
            print("*** Trackback " + url)
            print(parser.args)
            if url.find('?tb_id=') >= 0:
                # The ping URL already carries a query string: append the
                # arguments to it instead of sending them as request data.
                response = urllib.urlopen(url + "&" + parser.args)
            else:
                # Passing data makes urlopen issue a POST.
                response = urllib.urlopen(url, parser.args)
            try:
                print(response.read())
            finally:
                # Close the connection even when reading the reply fails.
                response.close()
        except Exception as e:
            # Best effort, but say so instead of failing silently.
            print("!!! Exception %s" % (e,))
def pingback(parser):
    """Visit every ``(target, server)`` pair in ``parser.pingbacks``.

    Args:
        parser: object providing ``pingbacks`` (iterable of
            ``(target, server URL)`` pairs) and ``url`` (the source entry).

    Prints each server URL and builds the XML-RPC proxy for it.  The ping
    call itself is disabled in this script (kept below as a comment), so
    nothing is sent.  A failure is reported and does not stop the remaining
    servers.  Returns None.
    """
    for target, server_url in parser.pingbacks:
        try:
            print("")
            print("*** Pingback " + server_url)
            # Building the proxy is kept from the original even though the
            # ping below is disabled.
            server = xmlrpclib.Server(server_url)
            # Disabled: print(server.pingback.ping(parser.url, target))
        except Exception as e:
            # Best effort, but say so instead of failing silently.
            print("!!! Exception %s" % (e,))
if __name__ == '__main__':
    # The pyblosxom config module has useful variables (config.py is the
    # settings dict read throughout this script).
    pybloxsom_config_path = "/home/reagle/data/2web/pyblosxom-1.1/contrib/"
    sys.path.append(pybloxsom_config_path)
    import config

    # Each command-line argument is an entry file to ping for.
    for name in sys.argv[1:]:
        name_abspath = os.path.abspath(name)
        name_ext = os.path.splitext(name)[1]
        # Files outside the blog's data directory are skipped.
        if not name_abspath.startswith(config.py['datadir']):
            continue
        entry_path = name_abspath[len(config.py['datadir']):]
        try:
            with open(name) as entry:
                body = entry.read()
            # My htmlentryparser looks for the title in h1
            if name_ext == ".html" and "htmlentryparser" in config.py['load_plugins']:
                # NOTE(review): the tags in this pattern were missing from
                # the file (it read "(.*) "); <h1>...</h1> is reconstructed
                # from the comment above -- confirm.
                match = re.search("<h1>(.*)</h1>", body)
                if match is None:
                    raise ValueError("no <h1> title found in " + name)
                title = match.group(1)
            else:  # otherwise assume the default text entries
                # The title is the first line of the text already read; the
                # file is closed and consumed by now, so it cannot be read
                # from again.
                lines = body.splitlines()
                if not lines:
                    raise ValueError("empty entry: " + name)
                title = lines[0]
            parser = link(entry_path, title, body)
            trackback(parser)
            pingback(parser)
        except Exception as e:
            # Report the failure and carry on with the next entry.
            print("!!! Exception %s" % (e,))