#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (c) 2021 Battelle Energy Alliance, LLC. All rights reserved.

###################################################################################################
# parse the field names from the header of the log file and compare them to the
# known list of total fields. if this zeek log's fields are a subset of the known fields,
# create a bitmap of the included fields to be included as a special tag
# which can help the logstash parser know on a line-by-line basis which fields are included.
# when logstash-filter-dissect gets this implemented, we may not have to do this:
#   - https://github.com/logstash-plugins/logstash-filter-dissect/issues/56
#   - https://github.com/logstash-plugins/logstash-filter-dissect/issues/62
#
# arguments: accepts one argument, the name of a zeek log file
# output: returns a string suitable for use as a tag indicating the field bitset, e.g., ZEEKFLDx00x01FFFFFF
#
#   ZEEKFLDx00x01FFFFFF
#           |  └ bitmap of included fields within field list
#           └ index into zeekLogFields list indicating the field list version
#             (to support legacy field configurations, see below)
#
# example:
#   $ ./zeek-log-field-bitmap.py /path/to/conn.log
#   ZEEKFLDx00x01FFFFFF
#
# there are two cases we're trying to cover here by indicating the field types:
#   1. certain fields can be turned on/off in config (for example, enabling/disabling MACs or VLANs for conn.log)
#   2. a Zeek version upgrade changed the field list (see notes about DHCP.log in
#      https://docs.zeek.org/en/latest/install/release-notes.html#bro-2-6)
#
# The first case is pretty simple, because in that case the fields in the zeek log will be some subset of
# the list of all known fields for that type.
#
# The second case is more complicated because the field list could be completely different.
# Because of this case each of the entries in zeekLogFields is itself a list,
# with older configurations occurring earlier in the list.
#
#   $ zeek-log-field-bitmap.py ./bro2.5/dhcp.log
#   ZEEKFLDx00x000003FF
#
#   $ zeek-log-field-bitmap.py ./bro2.6/dhcp.log
#   ZEEKFLDx01x00007FFF
#

import sys
import os
import json
from collections import defaultdict
from ordered_set import OrderedSet

# lists of all known fields for each type of zeek log we're concerned with mapping
# (ordered as in the .log file header) are stored in zeek-log-fields.json
FIELDS_JSON_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "zeek-log-fields.json")

ZEEK_LOG_DELIMITER = '\t'  # zeek log file field delimiter
ZEEK_LOG_HEADER_LOGTYPE = 'path'  # header value for zeek log type (conn, weird, etc.)
ZEEK_LOG_HEADER_FIELDS = 'fields'  # header value for zeek log fields list

# file prefix for bitmap to stdout, eg., ZEEKFLDx00x01FFFFFF
ZEEK_LOG_BITMAP_PREFIX = 'ZEEKFLD'


###################################################################################################
# print to stderr
def eprint(*args, **kwargs):
    """Print to stderr; takes the same arguments as print() other than `file`."""
    print(*args, file=sys.stderr, **kwargs)


###################################################################################################
# Set the index'th bit of v to 1 if x is truthy, else to 0, and return the new value
def set_bit(v, index, x):
    """Return `v` with bit number `index` set to 1 if `x` is truthy, else cleared to 0."""
    mask = 1 << index  # an integer with just bit 'index' set
    v &= ~mask  # unconditionally clear the bit...
    if x:
        v |= mask  # ...then set it again only when x is truthy
    return v


###################################################################################################
# load the canonical list of known zeek log fields from the JSON file
def _load_zeek_log_fields(jsonPath):
    """Load the known-fields JSON file.

    Returns a defaultdict mapping each log type (str) to a list of OrderedSet
    field lists (in the order they appear in the file), or None when the file
    cannot be read, is not valid JSON, or does not have the expected
    {logType: [[field, ...], ...]} structure.
    """
    try:
        # 'with' guarantees the handle is closed; OSError covers a missing or
        # unreadable file, ValueError covers malformed JSON and decode errors
        with open(jsonPath, 'r') as jsonFile:
            zeekLogFieldsTmp = json.load(jsonFile)
    except (OSError, ValueError):
        return None

    if not isinstance(zeekLogFieldsTmp, dict):
        return None

    zeekLogFields = defaultdict(list)
    for logType, listOfFieldLists in zeekLogFieldsTmp.items():
        if not (isinstance(logType, str) and isinstance(listOfFieldLists, list)):
            return None
        try:
            zeekLogFields[str(logType)] = [OrderedSet(fieldList) for fieldList in listOfFieldLists]
        except TypeError:
            # a field list that is not iterable or that contains unhashable entries
            return None
    return zeekLogFields


###################################################################################################
# read the '#'-prefixed header lines from the top of a zeek log file
def _read_zeek_log_headers(logPath):
    """Return a dict of the header values found at the top of the zeek log file.

    Each leading line starting with '#' is split on the zeek delimiter; the first
    token (minus the '#') is the key. A header with exactly one value is stored as
    a string, anything else as a list. Reading stops at the first non-header line.
    """
    headers = {}
    with open(logPath, "r") as zeekLogFile:
        for line in zeekLogFile:
            if not line.startswith('#'):
                break
            values = line.strip().split(ZEEK_LOG_DELIMITER)
            key = values.pop(0)[1:]
            headers[key] = values[0] if (len(values) == 1) else values
    return headers


###################################################################################################
# build the ZEEKFLDxNNxNNNNNNNN tag for a log file's set of field names
def _field_bitmap_tag(logFieldNames, fieldListVersions):
    """Return the bitmap tag for `logFieldNames`, or None if no known field list matches.

    `fieldListVersions` is the list of known field lists for one log type. It is
    searched from the last entry to the first, and the first list of which
    `logFieldNames` is a subset is used. Bit i of the bitmap is set when the i'th
    field of that list is present in the log file.
    """
    for versionIdx, allFieldNames in reversed(list(enumerate(fieldListVersions))):
        # are this logfile's fields a subset of the complete list?
        if logFieldNames.issubset(allFieldNames):
            # start from a clean bitmap so nothing leaks between candidate versions
            fieldsBitmap = 0
            for i, fName in enumerate(allFieldNames):
                fieldsBitmap = set_bit(fieldsBitmap, i, fName in logFieldNames)
            # stop at the first match so exactly one tag is produced
            return '{0}x{1:02X}x{2:08X}'.format(ZEEK_LOG_BITMAP_PREFIX, versionIdx, fieldsBitmap)
    return None


###################################################################################################
# main
def main():
    """Print the field-bitmap tag for the zeek log file named in sys.argv[1].

    Returns os.EX_OK when a tag was printed to stdout, os.EX_USAGE when the
    command-line arguments are invalid, and os.EX_DATAERR otherwise (bad JSON
    field definitions, unknown log type, missing headers, or no matching
    field list).
    """
    errCode = os.EX_DATAERR

    # load from json canonical list of known zeek log fields we're concerned with mapping
    zeekLogFields = _load_zeek_log_fields(FIELDS_JSON_FILE)

    if zeekLogFields is None:
        # something is wrong with the json file
        eprint("Error loading {} (not found or incorrectly formatted)".format(FIELDS_JSON_FILE))

    elif (len(sys.argv) == 2) and os.path.isfile(sys.argv[1]):
        # extract the header values, which contain among other things:
        #   - the "path" which is the zeek log type (eg., conn, weird, etc.)
        #   - the "fields" list of field names
        headers = _read_zeek_log_headers(sys.argv[1])
        logType = headers.get(ZEEK_LOG_HEADER_LOGTYPE)
        logFields = headers.get(ZEEK_LOG_HEADER_FIELDS)

        # a log with exactly one field yields a bare string; wrap it so it is
        # treated as one field name rather than a sequence of characters
        if isinstance(logFields, str):
            logFields = [logFields]

        if (isinstance(logType, str) and  # the "path" header exists and is a single value
                (logFields is not None) and  # the "fields" header exists
                (logType in zeekLogFields)):  # this zeek log type is one we're concerned with mapping
            tag = _field_bitmap_tag(OrderedSet(logFields), zeekLogFields[logType])
            if tag is not None:
                print(tag)
                errCode = os.EX_OK

    else:
        # invalid command-line arguments
        eprint("Usage: {} <zeek log file>".format(sys.argv[0]))
        errCode = os.EX_USAGE

    return errCode


if __name__ == '__main__':
    sys.exit(main())