From 5a9447ea7b570b9c8702472ce64016d481bb55b5 Mon Sep 17 00:00:00 2001 From: Patrick Cernko <pcernko@mpi-klsb.mpg.de> Date: Tue, 5 Oct 2021 12:13:50 +0200 Subject: [PATCH] added mailman-subscribers3.py from https://www.msapiro.net/scripts/mailman-subscribers3.py --- README.md | 6 + mailman-subscribers3.py | 414 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100755 mailman-subscribers3.py diff --git a/README.md b/README.md index 9c62c0a..206a03f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ This project was cloned from: https://gitlab.fechner.net/mfechner/mailman2sympa.git +mailman-subscriber3.py from +https://www.msapiro.net/scripts/mailman-subscribers3.py +see +https://wiki.list.org/DOC/How%20do%20I%20extract%20%28export%29%20a%20list%20of%20my%20list%27s%20members%20%28subscribers%29%3F + + Initially cloned from: https://git.fs.lmu.de/roots/mailman2sympa diff --git a/mailman-subscribers3.py b/mailman-subscribers3.py new file mode 100755 index 0000000..3b36363 --- /dev/null +++ b/mailman-subscribers3.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python3 +# vi: set et sw=4 st=4: +# +# 2004-08-27 Jim Tittsler <jwt@starship.python.net> +# 2004-10-03 jwt change authentication +# 2004-10-04 jwt remove dependency on ClientCookie +# 2004-10-07 jwt use getopt to retrieve host, list, password from command +# 2004-10-10 jwt return to using ClientCookie +# 2004-10-13 jwt add --fullnames option +# 2005-02-15 jwt switch on RFC2965 cookie support when newer version +# of ClientCookie is detected +# 2005-02-16 jwt use Python 2.4's cookielib if it is available +# 2005-02-27 jwt only visit the roster page for letters that exist +# 2005-06-04 mas add --nomail option (Mark Sapiro <mark@msapiro.net>) +# 2005-06-14 jwt handle chunks of email addresses starting [0-9]* +# 2006-01-27 mas Retry urllib2.URLError exceptions in main loop. +# Modify parser to get most of the member attributes on the +# page (I don't get nomail reason because I haven't yet +# figured out how, and I don't get the language option). +# This provides a foundation for adding options to deal +# with any of these attributes. +# 2006-01-28 mas Add --regular and --digest options. +# 2006-01-29 mas Get the nomail reason (I figured out how) +# Add the --csv option intended to produce a file that can +# be imported into a local spreadsheet. Mostly useful for +# larger lists when multiple sublists are desired and where +# multiple passes are expensive. +# 2006-04-10 mas Add some error checking for invalid URL (hostname), +# listname and password. +# 2006-04-11 mas Correct test on find(). Success is '>= 0', not 'True'. +# 2006-08-24 mas Catch more exceptions on invalid URLs. +# Add some more explaination of hostname and when +# member_url might need changing. +# 2006-09-20 Ed Lally <elally@jersey.net> +# 2006-09-20 ejl Add config variable for admin path (/mailman/admin/) for +# sites that don't use default URLs. +# 2006-09-21 mas Make Ed's change a command line option. +# 2007-05-07 mas Acommodate possible urllib.quote()ed email addresses. +# 2008-02-03 mas Clarify that script works with Membership list through +# 2.1.10. +# Fix broken --url_path option. +# 2008-10-06 mas Works with 2.1.11. +# Handle chunks starting with other than [0-9A-Z]. +# Print verbose output to stderr. +# 2008-10-07 mas Added -U/--unhide option +# 2008-10-09 mas Forgot to make the unhide '.' prints conditional on +# verbose. Also, csv printed "on" for members changed to +# unhidden. Fixed. +# 2011-10-24 mas Added type to nomail selection. +# 2012-10-20 mas Encode real name as iso-8859-1 to avoid Unicode error +# with non-ascii. +# 2012-11-14 jak Added support to use HTTPS (james@jameskinnaird.ca) +# 2013-01-25 mas Revised the help for -u. +# 2014-11-26 mas Tested with 2.1.18 and Python 2.7. +# Updated for '401' status return for invalid password +# in recent Mailman versions. +# 2015-08-09 mas Changed the real name encoding to make more robust. +# 2015-08-11 mas More changes for encodings. +# 2015-12-04 mas Changed error message for bad login result page. +# 2018-10-17 mas Converted for python3 and doc changes. +# 2018-10-18 mas Removed some unused code. +# + +"""List the email addresses subscribed to a mailing list, fetched from web. + +Usage: %(PROGRAM)s [options] hostname listname password + +Where: + --output file + -o file + Write output to specified file instead of standard out. + + --regular + -r + List only the regular (non-digest) members. + + --digest={any|mime|plain} + -d {any|mime|plain} + List only the digest members. One of 'any', 'mime' or 'plain' + is required. + 'any' lists all the digest members. + 'mime' lists only the mime digest members. + 'plain' lists only the plain digest members. + + --fullnames + -f + Include the full names in the output. + + --nomail={any|admin|bounce|user|unknown|enabled} + -n {any|admin|bounce|user|unknown|enabled} + List members based on their nomail status. One of 'any', 'admin', + 'bounce', 'user', 'unknown' or 'enabled' is required. + 'any' lists members with delivery disabled for any reason. + 'admin' lists members with delivery disabled by admin. + 'bounce' lists members with delivery disabled by bounce. + 'user' lists members with delivery disabled by the member. + 'unknown' lists members with delivery disabled by mailman 2.0 + 'enabled' lists members with delivery enabled. + + --csv + -c + This option overrides the above four selection options and lists + all members, one per line, with comma separated, quoted values as + follows: + "full name" if available, else "","email address","mod", + "hide","nomail" ("off" or "[A]" or "[B]" or "[U]" or "[?]"), + "ack","not metoo","nodupes","digest","plain" + analogous to the admin membership list (the values of the 'checkbox' + fields are either "off" or "on"). A title line with the above names + is listed before the member lines. + + --url_path path + -u path + If the list admin pages are accessed at your site via a URL of form + different from http://hostname/mailman/admin/listname, you need to + specify the path portion of the URL that is between hostname and + /listname with this option. For example, a URL such as + http://hostname/admin.cgi/listname requires the option + --url_path /admin.cgi + or + -u /admin.cgi + and a URL like http://hostname/cgi-bin/mailman/admin/listname + requires the option + --url_path /cgi-bin/mailman/admin + or + -u /cgi-bin/mailman/admin + Default value is /mailman/admin. + + --unhide + -U + Set the 'hidden' flag off for all list members including those not + selected for output. This will take a long time if there are a lot + of hidden members. The -v option prints '.' after every 100 unhides. + This option is only effective with Mailman versions up to 2.1.22 + because it doesn't account for CSRF checks introduced in 2.1.23. + + --ssl + -s + Use https instead of http for accessing the list. + + --verbose + -v + Include extra progress output. + + --help + -h + Print this help message and exit + + hostname is the name used in the URL of the list's web interface + listname is the name of the mailing list + password is the list's admin password + + The list of subscribers is fetched from the web administrative + interface. Using the bin/list_members program from a shell + account is preferable, but not always available. + + Tested with the Mailman 2.1.5 - 2.1.34 Membership list layout, but the + --unhide option only works up to 2.1.22. + + This script runs on your workstation and requires that you have Python + <http://www.python.org> installed. This version is for Python 3. +""" + +import sys +import re +import string +import urllib.request, urllib.parse, urllib.error +import getopt +import http.client +from time import sleep +from html.parser import HTMLParser +import http.cookiejar +policy = http.cookiejar.DefaultCookiePolicy(rfc2965 = True) +cookiejar = http.cookiejar.CookieJar(policy) +opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar)).open + +PROGRAM = sys.argv[0] + +def usage(code, msg=''): + if code: + fd = sys.stderr + else: + fd = sys.stdout + print(__doc__ % globals(), file=fd) + if msg: + print(msg, file=fd) + sys.exit(code) + +subscribers = {} +vnames = ['_realname', '_mod', '_hide', '_nomail', '_ack', '_notmetoo', + '_nodupes', '_digest', '_plain'] +maxchunk = 0 +letters = ['0'] +processed_letters = [] +gotnomail = False + +class MailmanHTMLParser(HTMLParser): + '''cheap way to find email addresses and pages with multiple + chunks from Mailman 2.1.5 membership pages''' + def handle_starttag(self, tag, attrs): + global maxchunk, letters, gotnomail, subemail, url_path + if tag == 'input': + for vname in vnames: + s = False + for a,v in attrs: + if a == 'name' and v.endswith(vname): + subemail = v[:-len(vname)] + s = True + elif a == 'value': + subval = v + if s: + if subemail not in subscribers: + subscribers[subemail] = {} + if vname == '_nomail' and subval == "on": + gotnomail = True + else: + if not isinstance(subval, str): + subval = subval.decode(page_cset, 'replace') + subscribers[subemail][vname] = subval + if tag == 'a': + for a,v in attrs: + if a == 'href' and v.find("%s/" % (url_path)) >= 0: + m = re.search(r'chunk=(?P<chunkno>\d+)', v, re.I) + if m: + if int(m.group('chunkno')) > maxchunk: + maxchunk = int(m.group('chunkno')) + m = re.search(r'letter=(?P<letter>.)', v, re.I) + if m: + letter = m.group('letter') + if letter not in letters + processed_letters: + letters.append(letter) + + def handle_data(self, data): + global gotnomail, subemail + if gotnomail: + gotnomail = False + subscribers[subemail]['_nomail'] = data + +def main(): + global maxchunk, letters, url_path, page_cset + try: + opts, args = getopt.getopt(sys.argv[1:], "ho:rd:fn:cu:Uvs", + ["help", "output=", "regular", "digest=", "fullnames", + "nomail=", "csv", "url_path=", "unhide", "verbose", + "ssl"]) + except: + usage(2) + fp = sys.stdout + fullnames = False + nomail = None + verbose = False + regular = False + digest = None + csv = False + unhide = False + protocol = 'http' + url_path = '/mailman/admin' + for o,a in opts: + if o in ("-v", "--verbose"): + verbose = True + if o in ("-h", "--help"): + usage(0) + if o in ("-o", "--output"): + fp = open(a, "wt") + if o in ("-f", "--fullnames"): + fullnames = True + if o in ("-n", "--nomail"): + nomail = a.lower() + if o in ("-r", "--regular"): + regular = True + if o in ("-d", "--digest"): + digest = a.lower() + if o in ("-c", "--csv"): + csv = True + if o in ("-u", "--url_path"): + url_path = a + if o in ("-U", "--unhide"): + unhide = True + if o in ("-s", "--ssl"): + protocol = 'https' + if regular and digest: + usage(2, "Both 'regular' and 'digest' will produce an empty list.") + if digest not in [None, 'any', 'mime', 'plain']: + usage(2, "Digest type %s unrecognized" % digest) + if nomail not in [None, 'any', 'admin', 'bounce', 'user', 'unknown', + 'enabled']: + usage(2, "Nomail type %s unrecognized" % nomail) + if len(args) != 3: + usage(2) + + member_url = '%s://%s%s/%s/members' % (protocol, args[0], url_path, + args[1]) + options_url = '%s://%s%s/%s' % (protocol, args[0], + re.sub('admin', 'options', url_path), + args[1]) + p = {'adminpw':args[2]} + # login, picking up the cookie + try: + page = opener(member_url, urllib.parse.urlencode(p).encode('us-ascii')) + except (urllib.error.URLError, http.client.InvalidURL) as e: + if isinstance(e, urllib.error.HTTPError) and e.code == 401: + usage(1, 'Invalid password.') + else: + usage(1, """Error accessing %s +Supplied host or listname may be incorrect, +or you may need to specify --url_path. +""" % (member_url)) + + # Get the charset of the page, but use iso-8859-1 for ascii or None. + page_cset = page.info().get_param('charset') or 'iso-8859-1' + if page_cset.lower().endswith('ascii'): + page_cset = 'iso-8859-1' + lines = page.read().decode(page_cset, errors='replace') + page.close() + p = {} + # Try to recognize the returned page independent of the list language + if re.search(r'INPUT\s+type="SUBMIT"\s+name="admlogin"', lines, + re.M + re.I): + # login page - invalid password + usage(1, + 'Login invalid - possibly incorrect password or missing -s option.') + if not re.search(r'<form\s+action=', lines, re.M + re.I): + # no <form> tag - admin overview page + usage(1, """Non-existent list: %s. +If the provided list name is valid, the supplied host may be incorrect +or you may need to specify --url_path. +""" % args[1]) + + # loop through the letters, and all chunks of each + while len(letters) > 0: + letter = letters[0] + letters = letters[1:] + processed_letters.append(letter) + chunk = 0 + maxchunk = 0 + while chunk <= maxchunk: + if verbose: + print("%c(%d)" % (letter, chunk), file=sys.stderr) + while True: + try: + page = opener(member_url + "?letter=%s&chunk=%d" % + (letter, chunk)) + lines = page.read().decode(page_cset, errors='replace') + page.close() + except urllib.error.URLError: + if verbose: + print('Error encountered in accessing web page.',\ + 'Retrying.', file=sys.stderr) + sleep(2) + else: + break + + parser = MailmanHTMLParser() + parser.feed(lines) + parser.close() + chunk += 1 + + subscriberlist = list(subscribers.items()) + subscriberlist.sort() + + # print the subscribers list + if csv: + print('"Full name","email address","mod","hide",\ +"nomail","ack","not metoo","nodupes","digest","plain"', file=fp) + + nunhide = 0 + for (email, d) in subscriberlist: + if unhide and d['_hide'] == "on": + params = urllib.parse.urlencode({'conceal':0, + 'options-submit':1}) + u = opener("%s/%s" % (options_url, email), + params.encode('us-ascii')) + u.close() + d['_hide'] = "off" + nunhide += 1 + if verbose and nunhide % 100 == 0: + print('.', end=' ', file=sys.stderr) + email = urllib.parse.unquote(email) + if csv: + print('"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s"'\ + % (d['_realname'], email, d['_mod'], d['_hide'], + d['_nomail'], d['_ack'], d['_notmetoo'], + d['_nodupes'], d['_digest'], d['_plain']), file=fp) + continue + if nomail == 'enabled' and d['_nomail'] != "off": + continue + if nomail == 'any' and d['_nomail'] == "off": + continue + if nomail == 'admin' and d['_nomail'] != "[A]": + continue + if nomail == 'bounce' and d['_nomail'] != "[B]": + continue + if nomail == 'user' and d['_nomail'] != "[U]": + continue + if nomail == 'unknown' and d['_nomail'] != "[?]": + continue + if regular and d['_digest'] == "on": + continue + if digest and d['_digest'] == "off": + continue + if digest == "mime" and d['_plain'] == "on": + continue + if digest == "plain" and d['_plain'] == "off": + continue + if not fullnames or d['_realname'] == "": + print(email, file=fp) + else: + print('%s <%s>' % (d['_realname'], email), file=fp) + + fp.close() + + +if __name__ == '__main__': + main() -- GitLab