DetectionLab/Vagrant/resources/malcolm/filebeat/scripts/zeek-log-field-bitmap.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (c) 2021 Battelle Energy Alliance, LLC.  All rights reserved.

###################################################################################################
# parse the field names from the header of the log file and compare them to the
# known list of total fields. if this zeek log's fields are a subset of the known fields,
# create a bitmap of the included fields to be included as a special tag
# which can help the logstash parser know on a line-by-line basis which fields are included.
# when logstash-filter-dissect gets this implemented, we may not have to do this:
#   - https://github.com/logstash-plugins/logstash-filter-dissect/issues/56
#   - https://github.com/logstash-plugins/logstash-filter-dissect/issues/62
#
# arguments: accepts one argument, the name of a zeek log file
# output:    prints a string suitable for use as a tag indicating the field bitset, e.g., ZEEKFLDx00x01FFFFFF
#
#            ZEEKFLDx00x01FFFFFF
#                   |  └ bitmap of included fields within field list
#                   └ index into zeekLogFields list indicating which field list matched (to support legacy field configurations, see below)
#
# example:
#            $ ./zeek-log-field-bitmap.py /path/to/conn.log
#            ZEEKFLDx00x01FFFFFF
#
# there are two cases we're trying to cover here by indicating the field types:
#   1. certain fields can be turned on/off in config (for example, enabling/disabling MACs or VLANs for conn.log)
#   2. a Zeek version upgrade changed the field list (see notes about DHCP.log in
#      https://docs.zeek.org/en/latest/install/release-notes.html#bro-2-6)
#
# The first case is pretty simple, because in that case the fields in the zeek log will be some subset of
# the list of all known fields for that type.
#
# The second case is more complicated because the field list could be completely different. Because of this case
# each of the entries in zeekLogFields is itself a list, with older configurations occurring earlier in the list.
#
#     $ zeek-log-field-bitmap.py ./bro2.5/dhcp.log
#     ZEEKFLDx00x000003FF
#
#     $ zeek-log-field-bitmap.py ./bro2.6/dhcp.log
#     ZEEKFLDx01x00007FFF
#

import sys
import os
import json
from collections import defaultdict
from ordered_set import OrderedSet

# lists of all known fields for each type of zeek log we're concerned with mapping (ordered as in the .log file header)
# are stored in zeek-log-fields.json, which is looked up in the same directory as this script (after resolving symlinks)
FIELDS_JSON_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "zeek-log-fields.json")

ZEEK_LOG_DELIMITER = '\t'            # zeek log file field delimiter (also used to split the '#' header lines)
ZEEK_LOG_HEADER_LOGTYPE = 'path'     # header key whose value is the zeek log type (conn, weird, etc.)
ZEEK_LOG_HEADER_FIELDS = 'fields'    # header key whose value is the zeek log fields list

# prefix of the bitmap tag written to stdout, eg., ZEEKFLDx00x01FFFFFF
ZEEK_LOG_BITMAP_PREFIX = 'ZEEKFLD'


###################################################################################################
# print to stderr
def eprint(*args, **kwargs):
  """Write to stderr exactly as print() would write to stdout.

  All positional and keyword arguments are forwarded to print() unchanged;
  the output stream is fixed to sys.stderr, so callers must not pass `file`.
  """
  print(*args, **kwargs, file=sys.stderr)

###################################################################################################
# Set the index'th bit of v to 1 if x is truthy, else to 0, and return the new value
def set_bit(v, index, x):
  """Return v with bit number `index` forced to 1 (x truthy) or 0 (x falsy).

  v     -- integer whose bit is to be changed (the argument itself is not modified)
  index -- zero-based bit position, counted from the least significant bit
  x     -- any value; only its truthiness is used
  """
  bit = 1 << index  # integer with only the target bit set
  if x:
    return v | bit  # turn the target bit on, leave every other bit alone
  return v & ~bit   # turn the target bit off, leave every other bit alone

###################################################################################################
# main
def main():
  """Print the field-bitmap tag for the Zeek log file named on the command line.

  The known field lists are loaded from FIELDS_JSON_FILE, the '#' header lines of
  the log file are parsed, and the log's "fields" header is compared against each
  known field list for the log's "path" (log type), newest list first. For the
  first list the log's fields are a subset of, a tag of the form
  ZEEKFLDx<list index>x<bitmap> is printed to stdout, where bit i of the bitmap is
  set when the i'th known field is present in the log.

  Returns an exit status:
    os.EX_OK       a tag was printed
    os.EX_USAGE    wrong number of arguments, or the argument is not a file
    os.EX_DATAERR  the JSON file could not be loaded, or the log's headers did not
                   match any known field list
  """
  # load from json canonical list of known zeek log fields we're concerned with mapping
  zeekLogFields = _load_known_fields(FIELDS_JSON_FILE)
  if zeekLogFields is None:
    # something is wrong with the json file
    eprint("Error loading {} (not found or incorrectly formatted)".format(FIELDS_JSON_FILE))
    return os.EX_DATAERR

  if (len(sys.argv) != 2) or (not os.path.isfile(sys.argv[1])):
    # invalid command-line arguments
    eprint("{} <Zeek log file>".format(sys.argv[0]))
    return os.EX_USAGE

  headers = _read_zeek_headers(sys.argv[1])
  logType = headers.get(ZEEK_LOG_HEADER_LOGTYPE)
  logFields = headers.get(ZEEK_LOG_HEADER_FIELDS)

  # a header with zero or several values is stored as a list; a list is not a usable
  # (hashable) log type, so treat it the same as an unknown log type
  if (not isinstance(logType, str)) or (logFields is None) or (logType not in zeekLogFields):
    return os.EX_DATAERR

  # a "fields" header naming exactly one field is stored as a bare string; wrap it so
  # that the set below holds that one field name rather than its individual characters
  if isinstance(logFields, str):
    logFields = [logFields]

  # the set of field names in *this* log file
  logFieldNames = OrderedSet(logFields)

  # newest field list first, so a log that fits several lists is tagged with the latest
  for versionIdx, allFieldNames in reversed(list(enumerate(zeekLogFields[logType]))):

    # are this logfile's fields a subset of the complete list?
    if logFieldNames.issubset(allFieldNames):

      # determine which fields in the complete list are included in this log file;
      # the bitmap starts from zero for this list so no bits leak in from elsewhere
      fieldsBitmap = 0
      for i, fName in enumerate(allFieldNames):
        fieldsBitmap = set_bit(fieldsBitmap, i, fName in logFieldNames)

      print('{0}x{1:02X}x{2:08X}'.format(ZEEK_LOG_BITMAP_PREFIX, versionIdx, fieldsBitmap))

      # exactly one tag is emitted per log file, so stop at the first match
      return os.EX_OK

  return os.EX_DATAERR

###################################################################################################
# load the known zeek log field lists from a json file
def _load_known_fields(jsonFileSpec):
  """Return {log type: [OrderedSet of field names, ...]} read from jsonFileSpec.

  Returns None when the file cannot be opened, is not valid JSON, or does not have
  the expected shape (an object mapping each log type to a list of field lists).
  """
  try:
    with open(jsonFileSpec, 'r') as jsonFile:
      rawFields = json.load(jsonFile)
  except (OSError, ValueError):
    # OSError: missing/unreadable file; ValueError: malformed JSON or undecodable text
    return None

  if not isinstance(rawFields, dict):
    return None

  knownFields = {}
  for logType, listOfFieldLists in rawFields.items():
    if not isinstance(listOfFieldLists, list):
      return None
    try:
      knownFields[str(logType)] = [OrderedSet(fieldList) for fieldList in listOfFieldLists]
    except TypeError:
      # a field list that is not iterable, or that holds unhashable entries
      return None
  return knownFields

###################################################################################################
# parse the '#' header lines at the top of a zeek log file
def _read_zeek_headers(logFileSpec):
  """Return a dict of the header lines found at the start of logFileSpec.

  Each header line is split on ZEEK_LOG_DELIMITER; the first item (minus its
  leading '#') is the key. A header with exactly one value maps to that string,
  any other header maps to the list of its values. Among other things this holds:
    - the "path" which is the zeek log type (eg., conn, weird, etc.)
    - the "fields" list of field names
  Reading stops at the first line that does not begin with '#'.
  """
  headers = {}
  with open(logFileSpec, "r") as zeekLogFile:
    for line in zeekLogFile:
      if not line.startswith('#'):
        break
      values = line.strip().split(ZEEK_LOG_DELIMITER)
      key = values.pop(0)[1:]
      headers[key] = values[0] if (len(values) == 1) else values
  return headers

# when run as a script (not imported), call main() and use its return value as the process exit status
if __name__ == '__main__':
  sys.exit(main())