From 5a9447ea7b570b9c8702472ce64016d481bb55b5 Mon Sep 17 00:00:00 2001
From: Patrick Cernko <pcernko@mpi-klsb.mpg.de>
Date: Tue, 5 Oct 2021 12:13:50 +0200
Subject: [PATCH] added mailman-subscribers3.py from
 https://www.msapiro.net/scripts/mailman-subscribers3.py

---
 README.md               |   6 +
 mailman-subscribers3.py | 414 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 420 insertions(+)
 create mode 100755 mailman-subscribers3.py

diff --git a/README.md b/README.md
index 9c62c0a..206a03f 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
 This project was cloned from:
 https://gitlab.fechner.net/mfechner/mailman2sympa.git
 
+mailman-subscribers3.py from
+https://www.msapiro.net/scripts/mailman-subscribers3.py
+see
+https://wiki.list.org/DOC/How%20do%20I%20extract%20%28export%29%20a%20list%20of%20my%20list%27s%20members%20%28subscribers%29%3F
+
+
 Initially cloned from:
 https://git.fs.lmu.de/roots/mailman2sympa
 
diff --git a/mailman-subscribers3.py b/mailman-subscribers3.py
new file mode 100755
index 0000000..3b36363
--- /dev/null
+++ b/mailman-subscribers3.py
@@ -0,0 +1,414 @@
+#!/usr/bin/env python3
+# vi: set et sw=4 sts=4:
+#
+# 2004-08-27 Jim Tittsler <jwt@starship.python.net>
+# 2004-10-03 jwt    change authentication
+# 2004-10-04 jwt    remove dependency on ClientCookie
+# 2004-10-07 jwt    use getopt to retrieve host, list, password from command
+# 2004-10-10 jwt    return to using ClientCookie
+# 2004-10-13 jwt    add --fullnames option
+# 2005-02-15 jwt    switch on RFC2965 cookie support when newer version
+#                     of ClientCookie is detected
+# 2005-02-16 jwt    use Python 2.4's cookielib if it is available
+# 2005-02-27 jwt    only visit the roster page for letters that exist
+# 2005-06-04 mas    add --nomail option (Mark Sapiro <mark@msapiro.net>)
+# 2005-06-14 jwt    handle chunks of email addresses starting [0-9]*
+# 2006-01-27 mas    Retry urllib2.URLError exceptions in main loop.
+#                   Modify parser to get most of the member attributes on the
+#                     page (I don't get nomail reason because I haven't yet
+#                     figured out how, and I don't get the language option).
+#                     This provides a foundation for adding options to deal
+#                     with any of these attributes.
+# 2006-01-28 mas    Add --regular and --digest options.
+# 2006-01-29 mas    Get the nomail reason (I figured out how)
+#                   Add the --csv option intended to produce a file that can
+#                     be imported into a local spreadsheet. Mostly useful for
+#                     larger lists when multiple sublists are desired and where
+#                     multiple passes are expensive.
+# 2006-04-10 mas    Add some error checking for invalid URL (hostname),
+#                     listname and password.
+# 2006-04-11 mas    Correct test on find(). Success is '>= 0', not 'True'.
+# 2006-08-24 mas    Catch more exceptions on invalid URLs.
+#                   Add some more explanation of hostname and when
+#                     member_url might need changing.
+# 2006-09-20 Ed Lally <elally@jersey.net>
+# 2006-09-20 ejl    Add config variable for admin path (/mailman/admin/) for
+#                     sites that don't use default URLs.
+# 2006-09-21 mas    Make Ed's change a command line option.
+# 2007-05-07 mas    Accommodate possible urllib.quote()ed email addresses.
+# 2008-02-03 mas    Clarify that script works with Membership list through
+#                   2.1.10.
+#                   Fix broken --url_path option.
+# 2008-10-06 mas    Works with 2.1.11.
+#                   Handle chunks starting with other than [0-9A-Z].
+#                   Print verbose output to stderr.
+# 2008-10-07 mas    Added -U/--unhide option
+# 2008-10-09 mas    Forgot to make the unhide '.' prints conditional on
+#                   verbose. Also, csv printed "on" for members changed to
+#                   unhidden. Fixed.
+# 2011-10-24 mas    Added type to nomail selection.
+# 2012-10-20 mas    Encode real name as iso-8859-1 to avoid Unicode error
+#                   with non-ascii.
+# 2012-11-14 jak    Added support to use HTTPS (james@jameskinnaird.ca)
+# 2013-01-25 mas    Revised the help for -u.
+# 2014-11-26 mas    Tested with 2.1.18 and Python 2.7.
+#                   Updated for '401' status return for invalid password
+#                   in recent Mailman versions.
+# 2015-08-09 mas    Changed the real name encoding to make more robust.
+# 2015-08-11 mas    More changes for encodings.
+# 2015-12-04 mas    Changed error message for bad login result page.
+# 2018-10-17 mas    Converted for python3 and doc changes.
+# 2018-10-18 mas    Removed some unused code.
+#
+
+"""List the email addresses subscribed to a mailing list, fetched from web.
+
+Usage: %(PROGRAM)s [options] hostname listname password
+
+Where:
+   --output file
+   -o file
+       Write output to specified file instead of standard out.
+
+   --regular
+   -r
+       List only the regular (non-digest) members.
+
+   --digest={any|mime|plain}
+   -d {any|mime|plain}
+       List only the digest members. One of 'any', 'mime' or 'plain'
+       is required.
+       'any' lists all the digest members.
+       'mime' lists only the mime digest members.
+       'plain' lists only the plain digest members.
+
+   --fullnames
+   -f
+       Include the full names in the output.
+
+   --nomail={any|admin|bounce|user|unknown|enabled}
+   -n {any|admin|bounce|user|unknown|enabled}
+       List members based on their nomail status. One of 'any', 'admin',
+       'bounce', 'user', 'unknown' or 'enabled' is required.
+       'any' lists members with delivery disabled for any reason.
+       'admin' lists members with delivery disabled by admin.
+       'bounce' lists members with delivery disabled by bounce.
+       'user' lists members with delivery disabled by the member.
+       'unknown' lists members with delivery disabled by mailman 2.0
+       'enabled' lists members with delivery enabled.
+
+   --csv
+   -c
+       This option overrides the above four selection options and lists
+       all members, one per line, with comma separated, quoted values as
+       follows:
+          "full name" if available, else "","email address","mod",
+          "hide","nomail" ("off" or "[A]" or "[B]" or "[U]" or "[?]"),
+          "ack","not metoo","nodupes","digest","plain"
+       analogous to the admin membership list (the values of the 'checkbox'
+       fields are either "off" or "on"). A title line with the above names
+       is listed before the member lines.
+
+   --url_path path
+   -u path
+       If the list admin pages are accessed at your site via a URL of form
+       different from http://hostname/mailman/admin/listname, you need to
+       specify the path portion of the URL that is between hostname and
+       /listname with this option. For example, a URL such as
+       http://hostname/admin.cgi/listname requires the option
+       --url_path /admin.cgi
+       or
+       -u /admin.cgi
+       and a URL like http://hostname/cgi-bin/mailman/admin/listname
+       requires the option
+       --url_path /cgi-bin/mailman/admin
+       or
+       -u /cgi-bin/mailman/admin
+       Default value is /mailman/admin.
+
+   --unhide
+   -U
+       Set the 'hidden' flag off for all list members including those not
+       selected for output.  This will take a long time if there are a lot
+       of hidden members.  The -v option prints '.' after every 100 unhides.
+       This option is only effective with Mailman versions up to 2.1.22
+       because it doesn't account for CSRF checks introduced in 2.1.23.
+
+   --ssl
+   -s
+       Use https instead of http for accessing the list.
+
+   --verbose
+   -v
+       Include extra progress output.
+
+   --help
+   -h
+       Print this help message and exit
+
+   hostname is the name used in the URL of the list's web interface
+   listname is the name of the mailing list
+   password is the list's admin password
+
+   The list of subscribers is fetched from the web administrative
+   interface.  Using the bin/list_members program from a shell
+   account is preferable, but not always available.
+
+   Tested with the Mailman 2.1.5 - 2.1.34 Membership list layout, but the
+   --unhide option only works up to 2.1.22.
+
+   This script runs on your workstation and requires that you have Python
+   <http://www.python.org> installed. This version is for Python 3.
+"""
+
+import sys
+import re
+import string
+import urllib.request, urllib.parse, urllib.error
+import getopt
+import http.client
+from time import sleep
+from html.parser import HTMLParser
+import http.cookiejar
+policy = http.cookiejar.DefaultCookiePolicy(rfc2965 = True)
+cookiejar = http.cookiejar.CookieJar(policy)  # keeps the admin login cookie
+opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar)).open
+# Script name, substituted for %(PROGRAM)s in the usage text.
+PROGRAM = sys.argv[0]
+
+def usage(code, msg=''):
+    """Print the module help text, then msg if non-empty, and exit(code).
+    Output goes to stderr when code is non-zero, otherwise to stdout."""
+    # A truthy exit status means an error, so route the text accordingly.
+    out = sys.stderr if code else sys.stdout
+    print(__doc__ % globals(), file=out)
+    if msg:
+        print(msg, file=out)
+    sys.exit(code)
+
+subscribers = {}        # member address -> {suffix from vnames: value}
+vnames = ['_realname', '_mod', '_hide', '_nomail', '_ack', '_notmetoo',
+          '_nodupes', '_digest', '_plain']  # form-field name suffixes read
+maxchunk = 0            # highest chunk number linked for the current letter
+letters = ['0']         # letter pages still to fetch
+processed_letters = []  # letter pages already fetched
+gotnomail = False       # True when the next text node is a nomail value
+
+class MailmanHTMLParser(HTMLParser):
+    '''Collect member settings and paging links from a Mailman 2.1
+    membership page.
+
+    Results go to module globals: subscribers (address -> option values),
+    maxchunk (highest chunk number linked) and letters (pages to visit).'''
+    def handle_starttag(self, tag, attrs):
+        global maxchunk, gotnomail, subemail
+        if tag == 'input':
+            # A bare attribute (no '=value') is reported as None and 'value'
+            # may be absent; read both cases as '' instead of raising.
+            fields = dict(attrs)
+            name = fields.get('name') or ''
+            subval = fields.get('value') or ''
+            for vname in vnames:
+                if not name.endswith(vname):
+                    continue
+                subemail = name[:-len(vname)]
+                member = subscribers.setdefault(subemail, {})
+                if vname == '_nomail' and subval == "on":
+                    # handle_data() stores the text that follows as the value.
+                    gotnomail = True
+                else:
+                    member[vname] = subval
+                break
+        if tag == 'a':
+            for a, v in attrs:
+                if a != 'href' or not v or v.find("%s/" % (url_path)) < 0:
+                    continue
+                m = re.search(r'chunk=(?P<chunkno>\d+)', v, re.I)
+                if m:
+                    maxchunk = max(maxchunk, int(m.group('chunkno')))
+                m = re.search(r'letter=(?P<letter>.)', v, re.I)
+                if m and m.group('letter') not in letters + processed_letters:
+                    letters.append(m.group('letter'))
+
+    def handle_data(self, data):
+        global gotnomail
+        if gotnomail:
+            gotnomail = False
+            subscribers[subemail]['_nomail'] = data
+
+def main():
+    """Fetch a list's membership from the Mailman admin web UI and print it.
+
+    Parses sys.argv, logs in with the list admin password, walks every
+    letter/chunk page of the member list and writes the selected members.
+    Bad options, a rejected login or an unknown list exit through usage().
+    """
+    global maxchunk, letters, url_path, page_cset
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "ho:rd:fn:cu:Uvs",
+                ["help", "output=", "regular", "digest=", "fullnames",
+                 "nomail=", "csv", "url_path=", "unhide", "verbose",
+                 "ssl"])
+    except getopt.GetoptError as e:
+        # Say which option was rejected instead of only printing the help.
+        usage(2, str(e))
+    fullnames = verbose = regular = csv = unhide = False
+    nomail = digest = outfile = None
+    protocol = 'http'
+    url_path = '/mailman/admin'
+    for o, a in opts:
+        if o in ("-v", "--verbose"):
+            verbose = True
+        elif o in ("-h", "--help"):
+            usage(0)
+        elif o in ("-o", "--output"):
+            outfile = a
+        elif o in ("-f", "--fullnames"):
+            fullnames = True
+        elif o in ("-n", "--nomail"):
+            nomail = a.lower()
+        elif o in ("-r", "--regular"):
+            regular = True
+        elif o in ("-d", "--digest"):
+            digest = a.lower()
+        elif o in ("-c", "--csv"):
+            csv = True
+        elif o in ("-u", "--url_path"):
+            url_path = a
+        elif o in ("-U", "--unhide"):
+            unhide = True
+        elif o in ("-s", "--ssl"):
+            protocol = 'https'
+    if regular and digest:
+        usage(2, "Both 'regular' and 'digest' will produce an empty list.")
+    if digest not in [None, 'any', 'mime', 'plain']:
+        usage(2, "Digest type %s unrecognized" % digest)
+    if nomail not in [None, 'any', 'admin', 'bounce', 'user', 'unknown',
+                      'enabled']:
+        usage(2, "Nomail type %s unrecognized" % nomail)
+    if len(args) != 3:
+        usage(2)
+    hostname, listname, password = args
+
+    member_url = '%s://%s%s/%s/members' % (protocol, hostname, url_path,
+                                           listname)
+    # Unhide requests use the same path with 'admin' replaced by 'options'.
+    options_url = '%s://%s%s/%s' % (protocol, hostname,
+                                    re.sub('admin', 'options', url_path),
+                                    listname)
+    # login, picking up the cookie
+    credentials = urllib.parse.urlencode({'adminpw': password})
+    try:
+        page = opener(member_url, credentials.encode('us-ascii'))
+    except (urllib.error.URLError, http.client.InvalidURL) as e:
+        if isinstance(e, urllib.error.HTTPError) and e.code == 401:
+            usage(1, 'Invalid password.')
+        else:
+            usage(1, """Error accessing %s
+Supplied host or listname may be incorrect,
+or you may need to specify --url_path.
+""" % (member_url))
+
+    # Get the charset of the page, but use iso-8859-1 for ascii or None.
+    page_cset = page.info().get_param('charset') or 'iso-8859-1'
+    if page_cset.lower().endswith('ascii'):
+        page_cset = 'iso-8859-1'
+    lines = page.read().decode(page_cset, errors='replace')
+    page.close()
+    # Try to recognize the returned page independent of the list language
+    if re.search(r'INPUT\s+type="SUBMIT"\s+name="admlogin"', lines,
+                 re.M + re.I):
+        # login page - invalid password
+        usage(1,
+          'Login invalid - possibly incorrect password or missing -s option.')
+    if not re.search(r'<form\s+action=', lines, re.M + re.I):
+        # no <form> tag - admin overview page
+        usage(1, """Non-existent list: %s.
+If the provided list name is valid, the supplied host may be incorrect
+or you may need to specify --url_path.
+""" % listname)
+
+    # loop through the letters, and all chunks of each
+    while letters:
+        letter = letters[0]
+        letters = letters[1:]
+        processed_letters.append(letter)
+        chunk = 0
+        maxchunk = 0
+        while chunk <= maxchunk:
+            if verbose:
+                print("%c(%d)" % (letter, chunk), file=sys.stderr)
+            # Every URLError is retried after a pause until a fetch succeeds.
+            while True:
+                try:
+                    page = opener(member_url + "?letter=%s&chunk=%d" %
+                            (letter, chunk))
+                    lines = page.read().decode(page_cset, errors='replace')
+                    page.close()
+                except urllib.error.URLError:
+                    if verbose:
+                        print('Error encountered in accessing web page.',\
+                            'Retrying.', file=sys.stderr)
+                    sleep(2)
+                else:
+                    break
+            parser = MailmanHTMLParser()
+            parser.feed(lines)
+            parser.close()
+            chunk += 1
+
+    subscriberlist = sorted(subscribers.items())
+    # The CSV columns that follow the name and address, in output order.
+    flags = ('_mod', '_hide', '_nomail', '_ack', '_notmetoo', '_nodupes',
+             '_digest', '_plain')
+    # Stored nomail value that each selectable --nomail reason must match.
+    reasons = {'admin': "[A]", 'bounce': "[B]", 'user': "[U]",
+               'unknown': "[?]"}
+
+    # Open the output only now, so a failed run leaves an old file intact.
+    fp = sys.stdout if outfile is None else open(outfile, "wt")
+    try:
+        # print the subscribers list
+        if csv:
+            print('"Full name","email address","mod","hide","nomail",'
+                  '"ack","not metoo","nodupes","digest","plain"', file=fp)
+        nunhide = 0
+        for (email, d) in subscriberlist:
+            if unhide and d['_hide'] == "on":
+                params = urllib.parse.urlencode({'conceal':0,
+                                                 'options-submit':1})
+                u = opener("%s/%s" % (options_url, email),
+                           params.encode('us-ascii'))
+                u.close()
+                d['_hide'] = "off"
+                nunhide += 1
+                if verbose and nunhide % 100 == 0:
+                    print('.', end=' ', file=sys.stderr)
+            email = urllib.parse.unquote(email)
+            if csv:
+                row = [d['_realname'], email] + [d[k] for k in flags]
+                # Double embedded quotes so every row stays well-formed CSV.
+                print(','.join('"%s"' % str(v).replace('"', '""')
+                               for v in row), file=fp)
+                continue
+            if ((nomail == 'enabled' and d['_nomail'] != "off") or
+                    (nomail == 'any' and d['_nomail'] == "off") or
+                    (nomail in reasons and d['_nomail'] != reasons[nomail])):
+                continue
+            if ((regular and d['_digest'] == "on") or
+                    (digest and d['_digest'] == "off") or
+                    (digest == "mime" and d['_plain'] == "on") or
+                    (digest == "plain" and d['_plain'] == "off")):
+                continue
+            if not fullnames or d['_realname'] == "":
+                print(email, file=fp)
+            else:
+                print('%s <%s>' % (d['_realname'], email), file=fp)
+    finally:
+        # Leave sys.stdout open for the caller; close only our own file.
+        if fp is not sys.stdout:
+            fp.close()
+
+
+if __name__ == '__main__':  # run only when executed as a script
+    main()
-- 
GitLab