Source code for apachelogs.util

from   datetime import date, datetime, timedelta, timezone
import re

#: Full English month names in calendar order, so that a name's list index
#: plus one is its month number
MONTH_FULL_NAMES = [
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September",
    "October", "November", "December",
]

#: Three-letter English month abbreviations in calendar order, so that an
#: abbreviation's list index plus one is its month number
MONTH_SNAMES = [
    "Jan", "Feb", "Mar",
    "Apr", "May", "Jun",
    "Jul", "Aug", "Sep",
    "Oct", "Nov", "Dec",
]

#: Compiled regex for an Apache timestamp of the form
#: ``DD/Mon/YYYY:HH:MM:SS +HHMM``, optionally enclosed in square brackets.
#:
#: The pattern is compiled in verbose mode (`re.X`), so the whitespace used
#: to lay it out is not part of what is matched.  Named groups: ``day``,
#: ``month`` (three word characters; not checked against the month names
#: here), ``year`` (four or more digits), ``hour``, ``minute``, ``second``,
#: ``tzoffset_sign`` (``+`` or ``-``), ``tzoffset_hour`` and
#: ``tzoffset_min``.  Any amount of whitespace (including none) may separate
#: the seconds from the timezone offset.
APACHE_TS_RGX = re.compile(r'''
    ^\[?
    (?P<day>\d\d)   / (?P<month>\w\w\w) / (?P<year>\d{4,})
    :(?P<hour>\d\d) : (?P<minute>\d\d)  : (?P<second>\d\d)
    \s* (?P<tzoffset_sign>[-+]) (?P<tzoffset_hour>\d\d) (?P<tzoffset_min>\d\d)
    \]?$
''', flags=re.X)

def parse_apache_timestamp(s):
    """
    Parse an Apache timestamp into a `datetime.datetime` object.  The month
    name in the timestamp is expected to be an abbreviated English name
    regardless of the current locale.

    >>> parse_apache_timestamp('[01/Nov/2017:07:28:29 +0000]')
    datetime.datetime(2017, 11, 1, 7, 28, 29, tzinfo=datetime.timezone.utc)

    :param str s:
        a string of the form ``DD/Mon/YYYY:HH:MM:SS +HHMM`` (optionally
        enclosed in square brackets).  `None` is accepted and passed through.
    :return:
        an aware `datetime.datetime`, or `None` if ``s`` is `None`
    :raises ValueError:
        if ``s`` is not in the expected format, names an unknown month, or
        contains a field value that `datetime.datetime` or
        `datetime.timezone` rejects as out of range
    """
    # Apache timestamps always use English month abbreviations.  Parsing with
    # ``datetime.strptime(s.strip('[]'), '%d/%b/%Y:%H:%M:%S %z')`` would
    # therefore fail in a locale with different month abbreviations, which is
    # why the fields are picked apart by hand here.
    if s is None:
        return None
    m = APACHE_TS_RGX.match(s)
    if not m:
        raise ValueError(s)
    data = m.groupdict()
    for k in ('year', 'day', 'hour', 'minute', 'second'):
        data[k] = int(data[k])
    try:
        data['month'] = MONTH_SNAMES.index(data['month']) + 1
    except ValueError:
        # The "x is not in list" error from list.index() is noise to the
        # caller; report the offending timestamp without chaining to it.
        raise ValueError(s) from None
    tzoffset = timedelta(
        hours=int(data.pop('tzoffset_hour')),
        minutes=int(data.pop('tzoffset_min')),
    )
    if data.pop('tzoffset_sign') == '-':
        tzoffset = -tzoffset
    # timezone() raises ValueError itself for offsets of 24 hours or more
    data['tzinfo'] = timezone(tzoffset)
    try:
        return datetime(**data)
    except OverflowError:
        # The regex allows arbitrarily many year digits; a value too large
        # for datetime's integer conversion would otherwise escape as
        # OverflowError instead of the documented ValueError.
        raise ValueError(s) from None
def unescape(s):
    r"""
    Unescape the escape sequences in the string ``s``, returning a `bytes`
    string.

    ``\xHH`` (two hexadecimal digits) becomes the character with that code;
    ``\t``, ``\n``, ``\r``, ``\b``, ``\v`` and ``\f`` become the matching
    control character; a backslash followed by any other character (except
    a newline) becomes just that character.  The unescaped text is then
    encoded as ISO-8859-1.

    :param str s: the escaped text
    :rtype: bytes
    :raises UnicodeEncodeError:
        if ``s`` contains a character that ISO-8859-1 cannot encode
    """
    # Escape sequences used by Apache: \b \n \r \t \v \\ \" \xHH
    # cf. ap_escape_logitem() in server/util.c
    return re.sub(r'\\(x[0-9A-Fa-f]{2}|.)', _unesc, s).encode('iso-8859-1')


#: Maps the letter of a single-character escape sequence to the control
#: character it stands for
_unescapes = {
    't': '\t',
    'n': '\n',
    'r': '\r',
    'b': '\b',
    'v': '\v',
    # Not emitted by Apache (as of v2.4), but other servers might use it:
    'f': '\f',
}


def _unesc(m):
    """
    Replacement callback for `unescape()`: given a match whose group 1 is
    the text following a backslash, return the unescaped character.
    """
    esc = m.group(1)
    # Group 1 is either ``xHH`` or a single character.  A lone ``x`` (a
    # backslash-x that was not followed by two hex digits) must not be
    # treated as a hex escape -- ``int('', 16)`` would raise ValueError --
    # so it falls through and is handled like any other unknown escape.
    if esc[0] == 'x' and len(esc) > 1:
        return chr(int(esc[1:], 16))
    return _unescapes.get(esc, esc)


def _date_from_yday(year, yday):
    """ Return the `datetime.date` that is day number ``yday`` of ``year`` """
    # NOTE(review): a yday larger than the length of ``year`` rolls over into
    # the following year here; callers only use the month/day of the result.
    return date(year, 1, 1) + timedelta(days=yday - 1)


def assemble_datetime(fields):
    """
    Given a `dict` of time fields, return a `datetime.datetime` object if
    there is enough information to create one, `None` otherwise.

    Keys that are absent and keys whose value is `None` are treated alike.
    The sources are tried in this order:

    1. ``"timestamp"`` -- returned as-is.
    2. ``"microepoch"``, ``"milliepoch"``, ``"epoch"`` -- microseconds,
       milliseconds or seconds since the epoch, converted with
       `datetime.fromtimestamp()` in ``fields["timezone"]``, or in UTC if
       no timezone is given.
    3. Individual components, each taken from the first available source:

       - year: ``"year"``; ``"date"``; ``"century"`` with ``"abbrev_year"``
       - month: ``"mon"``; ``"date"``; ``"yday"``; ``"full_mon"``;
         ``"abbrev_mon"``
       - day: ``"mday"``; ``"date"``; ``"yday"``
       - hour: ``"hour"``; ``"time"``; ``"hour_min"``; ``"hour12"`` with
         ``"am_pm"``
       - minute: ``"min"``; ``"time"``; ``"hour_min"``
       - second: ``"sec"``; ``"time"``
       - microsecond: ``"usec_frac"``; ``"msec_frac"`` (times 1000); else 0

       If the year, month, day, hour, minute or second cannot be
       determined, `None` is returned.  The result's ``tzinfo`` is
       ``fields.get("timezone")``, so it is naive when no timezone is given.

    :param dict fields: mapping of time field names to values
    :rtype: datetime.datetime or None
    """
    if fields.get("timestamp") is not None:
        return fields["timestamp"]

    tz = fields.get("timezone")

    # fields["timezone"] may be missing or None, in which case we still want
    # epoch-based values to come out as aware datetimes in UTC.
    epoch_tz = tz or timezone.utc
    if fields.get("microepoch") is not None:
        return datetime.fromtimestamp(fields["microepoch"] / 1000000, epoch_tz)
    if fields.get("milliepoch") is not None:
        return datetime.fromtimestamp(fields["milliepoch"] / 1000, epoch_tz)
    if fields.get("epoch") is not None:
        return datetime.fromtimestamp(fields["epoch"], epoch_tz)

    # --- year ---
    if fields.get("year") is not None:
        year = fields["year"]
    elif fields.get("date") is not None:
        year = fields["date"].year
    elif fields.get("century") is not None \
            and fields.get("abbrev_year") is not None:
        year = fields["century"] * 100 + fields["abbrev_year"]
    else:
        return None

    # --- month ---
    if fields.get("mon") is not None:
        month = fields["mon"]
    elif fields.get("date") is not None:
        month = fields["date"].month
    elif fields.get("yday") is not None:
        month = _date_from_yday(year, fields["yday"]).month
    elif fields.get("full_mon") in MONTH_FULL_NAMES:
        month = MONTH_FULL_NAMES.index(fields["full_mon"]) + 1
    elif fields.get("abbrev_mon") in MONTH_SNAMES:
        month = MONTH_SNAMES.index(fields["abbrev_mon"]) + 1
    else:
        return None

    # --- day of month ---
    if fields.get("mday") is not None:
        day = fields["mday"]
    elif fields.get("date") is not None:
        day = fields["date"].day
    elif fields.get("yday") is not None:
        day = _date_from_yday(year, fields["yday"]).day
    else:
        return None

    # --- hour ---
    if fields.get("hour") is not None:
        hour = fields["hour"]
    elif fields.get("time") is not None:
        hour = fields["time"].hour
    elif fields.get("hour_min") is not None:
        hour = fields["hour_min"].hour
    elif fields.get("hour12") is not None \
            and fields.get("am_pm") is not None \
            and fields["am_pm"].upper() in ('AM', 'PM'):
        # 12 AM is hour 0 and 12 PM is hour 12, hence the modulo
        hour = fields["hour12"] % 12
        if fields["am_pm"].upper() == "PM":
            hour += 12
    else:
        return None

    # --- minute ---
    if fields.get("min") is not None:
        minute = fields["min"]
    elif fields.get("time") is not None:
        minute = fields["time"].minute
    elif fields.get("hour_min") is not None:
        minute = fields["hour_min"].minute
    else:
        return None

    # --- second ---
    if fields.get("sec") is not None:
        second = fields["sec"]
    elif fields.get("time") is not None:
        second = fields["time"].second
    else:
        return None

    # --- fraction of a second (optional) ---
    if fields.get("usec_frac") is not None:
        microsecond = fields["usec_frac"]
    elif fields.get("msec_frac") is not None:
        microsecond = fields["msec_frac"] * 1000
    else:
        microsecond = 0

    return datetime(
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=second,
        microsecond=microsecond,
        tzinfo=tz,
    )