#!/usr/bin/env python
import re, urllib2, sys,time
"""
Download all the posts from a Google Groups group.
Usage: google_group_dl.py GROUP [ START ]
GROUP = name of the Google group, e.g. diybio
START = (optional) number of the topic at which to start downloading; defaults to 0
"""
# --- Command-line arguments ------------------------------------------------
# GROUP is required; give a usage hint instead of dying with a bare IndexError.
if len(sys.argv) < 2:
    sys.stderr.write("usage: %s GROUP [ START ]\n" % sys.argv[0])
    sys.exit(1)

# The group name is lower-cased before it is used in URLs and patterns.
group = sys.argv[1].lower()
print("downloading %s..." % group)

# --- Listing pages ---------------------------------------------------------
# Template for one page of the topic listing; fill with (group, start offset).
topic_url = 'http://groups.google.com/group/%s/topics?&start=%s&sa=N'
# Step between listing-page offsets (what google ships, hopefully won't
# change too much).
topic_increment = 10
# Offset of the first topic to download; the optional START argument
# overrides it.  int() raises ValueError if START is not a number.
topic_start = 0
if len(sys.argv) > 2:
    topic_start = int(sys.argv[2])

# --- Patterns and per-topic URL --------------------------------------------
# Captures the hexadecimal thread id from a thread link on a listing page.
# The group name is escaped so characters such as '.' are matched literally
# rather than acting as regex metacharacters.
topic_browse_re = re.compile(
    r'/group/%s/browse_thread/thread/([0-9a-f]+)' % re.escape(group))
# Template for a single thread page; fill with (group, topic_id).
topic_browse_url = 'http://groups.google.com/group/%s/browse_thread/thread/%s'
# Captures the total topic count from the "Topics 1 - N of TOTAL" text.
# NOTE(review): this only matches when the first page reports exactly
# topic_increment topics; presumably a group with fewer topics shows a
# different range and would not match -- confirm against a real page.
topic_count_re = re.compile(r'Topics 1 - %s of (\d+)' % topic_increment)

# --- HTTP client -----------------------------------------------------------
# Every request is sent with a browser-style User-agent header.
google_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2) Gecko/20100207 Namoroka/3.6'
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', google_agent)]
def url_open(url):
    """Open *url* with the module-level opener, retrying until it succeeds.

    Each failed attempt writes an 'x' to stderr and waits 60 seconds before
    the next try.  There is no retry limit, so the call blocks for as long
    as the URL keeps failing.

    Returns whatever ``opener.open(url)`` returns.
    """
    while True:
        try:
            return opener.open(url)
        except Exception:
            # Deliberately not a bare ``except:``: KeyboardInterrupt and
            # SystemExit must still be able to stop the script while a
            # request is in flight.
            sys.stderr.write('x')
            time.sleep(60)
def topic_count_get(group):
    """Return the total number of topics reported for *group*.

    Fetches the topic listing at offset 0 and scans it line by line for
    ``topic_count_re``.

    Returns the captured count as an int, or None when no line matches.
    """
    response = url_open(topic_url % (group, 0))
    try:
        for line in response:
            match = topic_count_re.search(line)
            if match:
                return int(match.group(1))
    finally:
        # Release the connection whether or not a count was found.
        response.close()
    return None
def topic_list(group, topic_count):
    """Return the thread ids linked from one listing page of *group*.

    topic_count is the ``start`` offset substituted into ``topic_url``.
    Every match of ``topic_browse_re`` on the page contributes its captured
    id, in page order; repeated links are not de-duplicated.
    """
    response = url_open(topic_url % (group, topic_count))
    try:
        topic_ids = []
        for line in response:
            # The pattern has a single group, so findall yields the ids.
            topic_ids.extend(topic_browse_re.findall(line))
        return topic_ids
    finally:
        # Release the connection even if reading the page fails.
        response.close()
def topic_save(group, topic):
    """Download one thread page and save it to a file named after *topic*.

    The file is created in the current working directory, overwriting any
    existing file of the same name.
    """
    # Fetch first so the output file is only created once a response exists.
    response = url_open(topic_browse_url % (group, topic))
    try:
        # Binary mode: the page is written exactly as received, with no
        # platform newline translation.
        topic_fd = open(topic, 'wb')
        try:
            topic_fd.writelines(response)
        finally:
            topic_fd.close()
    finally:
        response.close()
# --- Main driver -----------------------------------------------------------
# Ask for the total number of topics, then walk the listing one page at a
# time from topic_start and save every thread found on each page.
topic_count = topic_count_get(group)
if topic_count is None:
    # topic_count_get found no count on the listing page; without it the
    # range() below would fail with an unhelpful TypeError.
    sys.stderr.write(
        "could not determine the number of topics for %s\n" % group)
    sys.exit(1)
print("%s has %s topics" % (group, topic_count))
for count in range(topic_start, topic_count, topic_increment):
    print("get'ing topics %s" % count)
    for t in topic_list(group, count):
        topic_save(group, t)