#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (c) 2021 Battelle Energy Alliance, LLC.  All rights reserved.

###################################################################################################
# Monitor a directory for PCAP files and publish their filenames to a ZMQ socket for processing
#
# Run the script with --help for options
###################################################################################################

import argparse
import glob
import json
import logging
import magic
import os
import pathlib
import pyinotify
import signal
import sys
import time
import zmq

from pcap_utils import *

import elasticsearch
import elasticsearch_dsl

###################################################################################################
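# sane size bounds for candidate files: 24 bytes is the length of a bare PCAP global header
# (i.e., an empty capture), so anything smaller cannot be a valid PCAP file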
MINIMUM_CHECKED_FILE_SIZE_DEFAULT = 24
MAXIMUM_CHECKED_FILE_SIZE_DEFAULT = 32*1024*1024*1024

###################################################################################################
# for querying Arkime's "files" Elasticsearch index to avoid re-processing (duplicating sessions
# for) files that have already been processed
ARKIME_FILES_INDEX = "files"
ARKIME_FILE_TYPE = "file"
ARKIME_FILE_SIZE_FIELD = "filesize"

###################################################################################################
debug = False
verboseDebug = False
debugToggled = False
pdbFlagged = False
args = None
scriptName = os.path.basename(__file__)
scriptPath = os.path.dirname(os.path.realpath(__file__))
origPath = os.getcwd()
shuttingDown = False

###################################################################################################
# watch files written to and moved to this directory
class EventWatcher(pyinotify.ProcessEvent):

  # notify on files written in-place then closed (IN_CLOSE_WRITE), and moved into this directory (IN_MOVED_TO)
  _methods = ["IN_CLOSE_WRITE", "IN_MOVED_TO"]

  def __init__(self):
    global args
    global debug
    global verboseDebug

    super().__init__()

    self.useElastic = False

    # if we're going to be querying Elasticsearch for past PCAP file status, connect now
    if args.elasticHost is not None:

      connected = False
      healthy = False

      # create the connection to Elasticsearch
      while (not connected) and (not shuttingDown):
        try:
          if debug: eprint(f"{scriptName}:\tconnecting to Elasticsearch {args.elasticHost}...")
          elasticsearch_dsl.connections.create_connection(hosts=[args.elasticHost])
          if verboseDebug: eprint(f"{scriptName}:\t{elasticsearch_dsl.connections.get_connection().cluster.health()}")
          connected = elasticsearch_dsl.connections.get_connection() is not None

        except elasticsearch.exceptions.ConnectionError as connError:
          if debug: eprint(f"{scriptName}:\tElasticsearch connection error: {connError}")

        if (not connected) and args.elasticWaitForHealth:
          time.sleep(1)
        else:
          break

      # if requested, wait for at least "yellow" health in the cluster for the "files" index
      while connected and args.elasticWaitForHealth and (not healthy) and (not shuttingDown):
        try:
          if debug: eprint(f"{scriptName}:\twaiting for Elasticsearch to be healthy")
          elasticsearch_dsl.connections.get_connection().cluster.health(index=ARKIME_FILES_INDEX, wait_for_status='yellow')
          if verboseDebug: eprint(f"{scriptName}:\t{elasticsearch_dsl.connections.get_connection().cluster.health()}")
          healthy = True

        except elasticsearch.exceptions.ConnectionTimeout as connError:
          if verboseDebug: eprint(f"{scriptName}:\tElasticsearch health check: {connError}")

        if (not healthy):
          time.sleep(1)

      self.useElastic = connected and healthy

    # initialize ZeroMQ context and socket(s) to publish messages to
    self.context = zmq.Context()

    # Socket to send messages on
    if debug: eprint(f"{scriptName}:\tbinding publisher port {PCAP_TOPIC_PORT}")
    self.topic_socket = self.context.socket(zmq.PUB)
    self.topic_socket.bind(f"tcp://*:{PCAP_TOPIC_PORT}")

    # todo: do I want to set a send timeout? probably not, since this socket's whole job
    # is to send, and if it can't then there's no point; just block
    # self.topic_socket.SNDTIMEO = 5000

    if debug: eprint(f"{scriptName}:\tEventWatcher initialized")

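# for reference, a downstream consumer of these published messages might subscribe along
# these lines (a hypothetical sketch, not part of this script):
#   context = zmq.Context()
#   sub = context.socket(zmq.SUB)
#   sub.connect(f"tcp://localhost:{PCAP_TOPIC_PORT}")
#   sub.setsockopt_string(zmq.SUBSCRIBE, "")
#   fileInfo = json.loads(sub.recv_string())
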
###################################################################################################
# set up the event processing methods pyinotify will call for the events we're watching
def event_process_generator(cls, method):

  # actual method called when we are notified of a file
  def _method_name(self, event):

    global args
    global debug
    global verboseDebug

    if debug: eprint(f"{scriptName}:\t👓\t{event.pathname}")

    # the entity must be a regular PCAP file and actually exist
    if (not event.dir) and os.path.isfile(event.pathname):

      # get the file magic description and mime type
      fileMime = magic.from_file(event.pathname, mime=True)
      fileType = magic.from_file(event.pathname)

      # get the file size, in bytes, to compare against sane values
      fileSize = os.path.getsize(event.pathname)
      if (args.minBytes <= fileSize <= args.maxBytes) and ((fileMime in PCAP_MIME_TYPES) or ('pcap-ng' in fileType)):

        relativePath = remove_prefix(event.pathname, os.path.join(args.baseDir, ''))

        # check with Arkime's files index in Elasticsearch and make sure it's not a duplicate
        fileIsDuplicate = False
        if self.useElastic:
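          # a sketch of the roughly equivalent query DSL this builds (my reading of
          # elasticsearch_dsl, where .filter() populates bool.filter and .query() bool.must):
          #   {"query": {"bool":
          #     {"filter": [{"term": {"_type": "file"}}, {"term": {"node": args.molochNode}}],
          #      "must": [{"wildcard": {"name": "*/<relativePath>"}}]}}}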
          s = elasticsearch_dsl.Search(index=ARKIME_FILES_INDEX) \
              .filter("term", _type=ARKIME_FILE_TYPE) \
              .filter("term", node=args.molochNode) \
              .query("wildcard", name=f"*{os.path.sep}{relativePath}")
          response = s.execute()
          for hit in response:
            fileInfo = hit.to_dict()
            if (ARKIME_FILE_SIZE_FIELD in fileInfo) and (fileInfo[ARKIME_FILE_SIZE_FIELD] == fileSize):
              fileIsDuplicate = True
              break

        if fileIsDuplicate:
          # this is a duplicate file (it's been processed before), so ignore it
          if debug: eprint(f"{scriptName}:\t📋\t{event.pathname}")

        else:
          # the entity is a right-sized non-duplicate file, and it exists, so send it to get processed
          if debug: eprint(f"{scriptName}:\t📩\t{event.pathname}")
          try:
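            # the published message is a single JSON object along these lines (the actual key
            # name constants come from pcap_utils, so the keys shown here are illustrative):
            #   {"name": "subdir/capture.pcap", "size": 1024,
            #    "mime": "application/vnd.tcpdump.pcap", "type": "pcap capture file", "tags": [...]}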
            fileInfo = {FILE_INFO_DICT_NAME: event.pathname if args.includeAbsolutePath else relativePath, \
                        FILE_INFO_DICT_SIZE: fileSize, \
                        FILE_INFO_FILE_MIME: fileMime, \
                        FILE_INFO_FILE_TYPE: fileType, \
                        FILE_INFO_DICT_TAGS: tags_from_filename(relativePath)}
            self.topic_socket.send_string(json.dumps(fileInfo))
            if debug: eprint(f"{scriptName}:\t📫\t{fileInfo}")
          except zmq.Again as timeout:
            if verboseDebug: eprint(f"{scriptName}:\t🕑\t{event.pathname}")

      else:
        # too small/big to care about, or the wrong type; ignore it
        if debug: eprint(f"{scriptName}:\t✋\t{event.pathname}")

  # assign process method to class
  _method_name.__name__ = "process_{}".format(method)
  setattr(cls, _method_name.__name__, _method_name)

###################################################################################################
# handle sigint/sigterm and set a global shutdown variable
def shutdown_handler(signum, frame):
  global shuttingDown
  shuttingDown = True

###################################################################################################
# handle sigusr1 for a pdb breakpoint
def pdb_handler(sig, frame):
  global pdbFlagged
  pdbFlagged = True

###################################################################################################
# handle sigusr2 for toggling debug
def debug_toggle_handler(signum, frame):
  global debug
  global debugToggled
  debug = not debug
  debugToggled = True

###################################################################################################
# main
def main():
  global args
  global debug
  global verboseDebug
  global debugToggled
  global pdbFlagged
  global shuttingDown

  parser = argparse.ArgumentParser(description=scriptName, add_help=False, usage='{} <arguments>'.format(scriptName))
  parser.add_argument('-v', '--verbose', dest='debug', help="Verbose output", metavar='true|false', type=str2bool, nargs='?', const=True, default=False, required=False)
  parser.add_argument('--extra-verbose', dest='verboseDebug', help="Super verbose output", metavar='true|false', type=str2bool, nargs='?', const=True, default=False, required=False)

  parser.add_argument('--min-bytes', dest='minBytes', help="Minimum size for checked files", metavar='<bytes>', type=int, default=MINIMUM_CHECKED_FILE_SIZE_DEFAULT, required=False)
  parser.add_argument('--max-bytes', dest='maxBytes', help="Maximum size for checked files", metavar='<bytes>', type=int, default=MAXIMUM_CHECKED_FILE_SIZE_DEFAULT, required=False)
  parser.add_argument('--elasticsearch', required=False, dest='elasticHost', metavar='<STR>', type=str, default=None, help='Elasticsearch connection string for querying Arkime files index to ignore duplicates')
  parser.add_argument('--elasticsearch-wait', dest='elasticWaitForHealth', help="Wait for Elasticsearch to be healthy before starting", metavar='true|false', type=str2bool, nargs='?', const=True, default=False, required=False)
  parser.add_argument('--moloch-node', required=False, dest='molochNode', metavar='<STR>', type=str, default='arkime', help='Arkime node value for querying Arkime files index to ignore duplicates')

  parser.add_argument('--ignore-existing', dest='ignoreExisting', help="Ignore preexisting files in the monitor directory", metavar='true|false', type=str2bool, nargs='?', const=True, default=False, required=False)
  parser.add_argument('--absolute-path', dest='includeAbsolutePath', help="Publish absolute path for message (vs. path relative to monitored directory)", metavar='true|false', type=str2bool, nargs='?', const=True, default=False, required=False)
  parser.add_argument('--start-sleep', dest='startSleepSec', help="Sleep for this many seconds before starting", metavar='<seconds>', type=int, default=0, required=False)
  parser.add_argument('-r', '--recursive-directory', dest='recursiveDir', help="If specified, monitor all directories with this name underneath --directory", metavar='<name>', type=str, required=False)
  requiredNamed = parser.add_argument_group('required arguments')
  requiredNamed.add_argument('-d', '--directory', dest='baseDir', help='Directory to monitor', metavar='<directory>', type=str, required=True)

  try:
    parser.error = parser.exit
    args = parser.parse_args()
  except SystemExit:
    parser.print_help()
    exit(2)

  verboseDebug = args.verboseDebug
  debug = args.debug or verboseDebug
  if debug:
    eprint(os.path.join(scriptPath, scriptName))
    eprint("{} arguments: {}".format(scriptName, sys.argv[1:]))
    eprint("{} arguments: {}".format(scriptName, args))
  else:
    sys.tracebacklimit = 0

  logging.basicConfig(level=logging.ERROR)

  # handle sigint and sigterm for graceful shutdown
  signal.signal(signal.SIGINT, shutdown_handler)
  signal.signal(signal.SIGTERM, shutdown_handler)
  signal.signal(signal.SIGUSR1, pdb_handler)
  signal.signal(signal.SIGUSR2, debug_toggle_handler)
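  # note: from a shell, "kill -USR1 <pid>" breaks into pdb at the main loop below, and
  # "kill -USR2 <pid>" toggles debug output at runtime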

  # sleep for a bit if requested
  sleepCount = 0
  while (not shuttingDown) and (sleepCount < args.startSleepSec):
    time.sleep(1)
    sleepCount += 1

  # add events to watch to EventWatcher class
  for method in EventWatcher._methods:
    event_process_generator(EventWatcher, method)

  # if the directory to monitor doesn't exist, create it now
  if os.path.isdir(args.baseDir):
    preexistingDir = True
  else:
    preexistingDir = False
    if debug: eprint(f'{scriptName}: creating "{args.baseDir}" to monitor')
    pathlib.Path(args.baseDir).mkdir(parents=False, exist_ok=True)

  # if recursion was requested, get the list of directories to monitor
  watchDirs = []
  while (len(watchDirs) == 0):
    if args.recursiveDir is None:
      watchDirs = [args.baseDir]
    else:
      watchDirs = glob.glob(f'{args.baseDir}/**/{args.recursiveDir}', recursive=True)
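      # e.g., with --directory /pcap and --recursive-directory upload, this would match
      # directories like /pcap/upload and /pcap/host1/upload (hypothetical paths)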

  # begin threaded watch of path(s)
  time.sleep(1)

  event_notifier_started = False
  watch_manager = pyinotify.WatchManager()
  event_notifier = pyinotify.ThreadedNotifier(watch_manager, EventWatcher())
  for watchDir in watchDirs:
    watch_manager.add_watch(os.path.abspath(watchDir), pyinotify.ALL_EVENTS)
  if debug: eprint(f"{scriptName}: monitoring {watchDirs}")
  time.sleep(2)
  if (not shuttingDown):
    event_notifier.start()
    event_notifier_started = True

  # if there are any preexisting files (and --ignore-existing wasn't specified), "touch" them
  # so that they generate events and get processed
  if preexistingDir and (not args.ignoreExisting) and (not shuttingDown):
    filesTouched = 0
    for watchDir in watchDirs:
      for preexistingFile in [str(x) for x in pathlib.Path(watchDir).iterdir() if x.is_file()]:
        touch(preexistingFile)
        filesTouched += 1
    if debug and (filesTouched > 0):
      eprint(f"{scriptName}: found {filesTouched} preexisting files to check")

  # loop forever, or until we're told to shut down, whichever comes first
  while (not shuttingDown):
    if pdbFlagged:
      pdbFlagged = False
      breakpoint()
    time.sleep(0.2)

  # graceful shutdown
  if debug: eprint(f"{scriptName}: shutting down...")
  if event_notifier_started:
    event_notifier.stop()
  time.sleep(1)

  if debug: eprint(f"{scriptName}: finished monitoring {watchDirs}")

if __name__ == '__main__':
  main()