Douglas Bagnall on Wed, 4 Sep 2019 11:49:17 +0200 (CEST)


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Re: <nettime> The effect of "Nettime is in bad shape" on user agent ratios


On 4/09/19 4:41 am, John Preston wrote:
> Thanks Douglas. I like this. I would like to play with this on a
> wider scale (listiverse). Do you have a script to scrape out the
> headers from the archived messages or something?

Not if you mean web archives. If you mean mbox files, then yes -- last
night's script is below. If you wish to compare time periods you will
need to cut up the mbox file yourself.

Douglas

----------------8<------countmuas.py-----------------------------------
#!/usr/bin/python3
"""Count user agent headers in mbox files

USAGE: python3 countmuas.py MBOX [MBOX [...]]

The order in which the results are presented depends on the overall
counts, thus the output of

    countmuas.py A B C

is likely to look different from

    countmuas.py A; countmuas.py B; countmuas.py C

User-agent and X-Mailer headers are used where available; further
heuristics attempt to distinguish webmail providers.
"""
import mailbox
import sys
from collections import Counter
import re

# Header-name predicates (case-insensitive prefix match).  A message that
# carries any header matching one of these is tagged 'gmail' / 'microsoft'
# by count_user_agents() below.
is_google = re.compile(r'^(x-gm-|x-google)', re.I).match
is_microsoft = re.compile(r'^(x-ms-|x-microsoft)', re.I).match


def _raw_agent_label(msg):
    """Return the un-normalised, '|'-joined agent label for one message.

    The label is made of, in order: 'microsoft' and/or 'gmail' when a
    matching provider header is present, then the User-Agent and X-Mailer
    values with version-like digit runs stripped.  A message with none of
    these yields "unknown".
    """
    header_names = msg.keys()
    parts = []

    if any(is_microsoft(h) for h in header_names):
        parts.append('microsoft')

    if any(is_google(h) for h in header_names):
        parts.append('gmail')

    # msg.get() hands back an email.header.Header (not a str) when the raw
    # header contains undecodable 8-bit bytes; str() makes it regex-safe.
    ua = msg.get('User-Agent')
    ua = '' if ua is None else str(ua)
    if ua:
        ua = re.sub(r'[\d.]\w$', '', ua)
        ua = re.sub(r'\d\w?[\d.]*', '', ua)
        parts.append(ua)

    xm = msg.get('X-Mailer')
    xm = '' if xm is None else str(xm)
    if xm:
        xm = re.sub(r'\d+[\d.]*\w?[\d.]*', '', xm)
        parts.append(xm)

    label = '|'.join(parts) or "unknown"
    return re.sub(r'\s+', ' ', label).strip()


def _canonical_agent_name(label):
    """Map a raw label from _raw_agent_label() to a display name."""
    ua = re.sub(r'[^\w ]+', '', label).lower().strip()

    if any(x in ua for x in ('ymailnorrin', 'aolwebmail', 'yahoomail')):
        return 'yahoo/aol'
    if 'thunderbird' in ua:
        for o in ('linux', 'macintosh', 'windows'):
            if o in ua:
                return 'Thunderbird (%s)' % o.title()
        # Unrecognised OS: still bucket it as Thunderbird instead of
        # leaking the raw lowercase agent string into the results.
        return 'Thunderbird'
    if 'mew version on emacs' in ua:
        return 'Mew (Emacs)'
    if 'cyrusjmap' in ua:
        return 'Cyrus webmail'
    if 'jaro mail' in ua:
        return 'Jaro Mail'
    if 'trojita' in ua:
        return 'Trojita'
    # 'xsll' is what the digit stripping in _raw_agent_label leaves of the
    # mailer name once '4A' has been removed from it.
    if 'xsll' in ua:
        return 'XS4all Webmail'
    if 'claws mail' in ua:
        return 'Claws Mail'
    if ua in ('microsoft', 'microsoftgmail'):
        return 'MS/Outlook.com/Hotmail'
    if ua == 'gmail':
        return 'Gmail'

    # Anything else: drop the provider tag and trailing packaging noise,
    # then title-case whatever remains.
    ua = ua.replace('gmail', '')
    ua = re.sub(' ?deb$', '', ua)
    ua = re.sub(r' version\s*$', '', ua)
    return ua.title()


def count_user_agents(mbox):
    """Count the mail user agents seen in one mbox file.

    mbox is the path of an existing mbox file.  Returns a Counter mapping
    a display name (e.g. 'Gmail', 'Thunderbird (Linux)', 'Unknown') to the
    number of messages attributed to it.

    Raises mailbox.NoSuchMailboxError if the file does not exist.
    """
    # create=False: the default (True) would silently create an empty file
    # for a mistyped path.
    m = mailbox.mbox(mbox, create=False)
    try:
        ua_counts = Counter(_raw_agent_label(msg) for msg in m)
    finally:
        m.close()

    clean = Counter()
    for label, count in ua_counts.most_common():
        clean[_canonical_agent_name(label)] += count
    return clean


def print_user_agents(counts, names=None):
    if names is None:
        names = sorted(list(counts.keys()))
    total = sum(counts.values())
    for ua in names:
        count = counts[ua]
        percent = (count * 100.0 / total)
        print("|%-30s   %4.1f%% %s" % ('#' * (int(percent * 1 + 0.5) ),
                                      percent, ua))


def main():
    """Command-line entry point: print a histogram for each mbox argument.

    Every argument after the program name is treated as an mbox path.
    If '-h' or '--help' appears among them, the module docstring is
    printed and the process exits.  Otherwise each file is counted, and
    every file's histogram is printed with the same row order: agent
    names sorted by their combined count across all files, most common
    first.
    """
    files = sys.argv[1:]
    if {'-h', '--help'}.intersection(files):
        print(__doc__)
        sys.exit()

    mbox_counts = [count_user_agents(mbox) for mbox in files]

    # Sum the per-file counts so that all files share one row ordering.
    overall = Counter()
    for counts in mbox_counts:
        overall.update(counts)
    names = [name for name, _ in overall.most_common()]

    for filename, counts in zip(files, mbox_counts):
        print('----- %s -----' % filename)
        print_user_agents(counts, names)


# Only run when executed as a script, so importing this module has no
# side effects.
if __name__ == '__main__':
    main()
#  distributed via <nettime>: no commercial use without permission
#  <nettime>  is a moderated mailing list for net criticism,
#  collaborative text filtering and cultural politics of the nets
#  more info: http://mx.kein.org/mailman/listinfo/nettime-l
#  archive: http://www.nettime.org contact: nettime@kein.org
#  @nettime_bot tweets mail w/ sender unless #ANON is in Subject: