#!/usr/bin/env python import re, urllib2, sys,time """ download all the posts from a google groups group usage: google_group_dl.py GROUP [ START ] GROUP = name of google group, e.g. diybio START = number of the topic at which to start downloading """ group = sys.argv[1] group = group.lower() print "downloading %s..." % group topic_url = 'http://groups.google.com/group/%s/topics?&start=%s&sa=N' # group, count topic_increment = 10 # what google ships, hopefully won't change too much topic_start = 0 if len(sys.argv) == 3: topic_start = int( sys.argv[2] ) topic_browse_re = '/group/%s/browse_thread/thread/([0-9a-f]+)' % group topic_browse_re = re.compile(topic_browse_re) topic_browse_url = 'http://groups.google.com/group/%s/browse_thread/thread/%s' # group, topic_id topic_count_re = 'Topics 1 - %s of (\d+)' % topic_increment topic_count_re = re.compile( topic_count_re) google_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2) Gecko/20100207 Namoroka/3.6' opener = urllib2.build_opener() opener.addheaders = [('User-agent', google_agent)] def url_open(url): while(1): try: return opener.open(url) except: sys.stderr.write('x') time.sleep(60) def topic_count_get(group): for line in url_open( topic_url % (group, 0) ): for count in topic_count_re.finditer(line): return int(count.group(1)) def topic_list(group, topic_count): lis = [] for line in url_open( topic_url % (group, topic_count) ): for topic in topic_browse_re.finditer(line): lis.append(topic.group(1)) return lis def topic_save(group, topic): topic_fd = open(topic, 'w') for line in url_open( topic_browse_url % ( group, topic) ): topic_fd.write(line) topic_fd.close() topic_count = topic_count_get(group) print "%s has %s topics" % (group, topic_count) for count in range(topic_start, topic_count, topic_increment): print "get'ing topics %s" % count for t in topic_list(group, count): topic_save( group, t )