<?xml version="1.0" encoding="iso-8859-1"?>

<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
<title type="text">Joseph Reagle</title>
<subtitle type="html"><![CDATA[
Open Communities, Media, Source, and Standards
]]></subtitle>
<id>http://reagle.org/joseph/blog/method/wikipedia-history-scraping</id>
<link rel="alternate" type="text/html" href="http://reagle.org/joseph/blog" />
<link rel="self" type="application/atom+xml" href="http://reagle.org/joseph/blog/method/wikipedia-history-scraping?flav=atom" />


<author>
<!-- Person construct for the feed author. atom:email is optional and is
     left out because no address is given; an empty element is not a valid
     address. -->
<name>Joseph Reagle</name>
<uri>http://reagle.org/joseph/blog/method/wikipedia-history-scraping</uri>
</author>
<rights>Copyright 2003-2010 Joseph Reagle</rights>
<!-- Software that produced this feed. The element text gives the generator
     name followed by the same uri and version carried in the attributes. -->
<generator uri="http://pyblosxom.sourceforge.net/" version="1.4.3 01/10/2008">
PyBlosxom http://pyblosxom.sourceforge.net/ 1.4.3 01/10/2008
</generator>

<updated>2005-12-13T23:34:16Z</updated>
<!-- icon?  logo?  -->

<entry>
<title type="html">Wikipedia History Scraping</title>
<!-- Category term taken from the directory segment of this entry's
     alternate link ("method"); the term was previously blank.
     NOTE(review): confirm this matches the blog's category layout. -->
<category term="method" />
<id>http://reagle.org/joseph/blog/2005/12/13/wikipedia-history-scraping</id>
<updated>2005-12-13T23:34:16Z</updated>
<published>2005-12-13T23:34:16Z</published>
<link rel="alternate" type="text/html" href="http://reagle.org/joseph/blog/method/wikipedia-history-scraping.html" />
<!-- The body is entity-escaped HTML (type="html"), so markup that appears
     literally inside the code sample is escaped a second time. -->
<content type="html">

&lt;p&gt;To confirm the power law in Wikipedia edits (many doing a little, a few
doing much) this regular expression and Python code parses a Wikipedia
history fairly well:&lt;/p&gt;

&lt;blockquote&gt;
  &lt;pre&gt;history_regex = r&quot;&quot;&quot;.*?oldid=(\d+).*(\d\d:\d\d.*?\d\d\d\d)&amp;lt;/a&amp;gt;.*&amp;lt;span class=&apos;history-user&apos;&amp;gt;.*?&amp;gt;(.*?)&amp;lt;/a&amp;gt;.*(?:&amp;lt;span class=&apos;comment&apos;&amp;gt;(.*?)&amp;lt;/span&amp;gt;)?&amp;lt;/li&amp;gt;&quot;&quot;&quot;
regex_obj = re.compile(history_regex)

url = sys.argv[1]
html = getHTML(url)
lines = html.split(&apos;\n&apos;)
for line in lines:
    if line.startswith(&quot;&amp;lt;li&amp;gt;(&amp;lt;a&quot;):
        counter = counter+1
        match_obj = regex_obj.search(line)
        if match_obj:
            oldid,date,author,comment = match_obj.groups()
            edits.setdefault(author,[]).append((oldid,date,author,comment))
counts = [(author,len(edits[author])) for author in edits.keys()]
counts_s = sorted(counts, reverse=True, key=operator.itemgetter(1))
print counter
for author,number in counts_s:
    print author, &quot;;&quot;, number&lt;/pre&gt;
&lt;/blockquote&gt;
</content>
</entry>
</feed>
