#!/usr/bin/env python3 # -*- coding: utf-8 -*- # Copyright (c) 2021 Battelle Energy Alliance, LLC. All rights reserved. import clamd import hashlib import json import malass_client import os import re import requests import sys import time import yara import zmq from abc import ABC, abstractmethod from bs4 import BeautifulSoup from collections import Counter from collections import deque from collections import defaultdict from datetime import datetime from multiprocessing import RawValue from subprocess import (PIPE, Popen) from threading import get_ident from threading import Lock ################################################################################################### VENTILATOR_PORT = 5987 SINK_PORT = 5988 TOPIC_FILE_SCAN = "file" ################################################################################################### # modes for file preservation settings PRESERVE_QUARANTINED = "quarantined" PRESERVE_ALL = "all" PRESERVE_NONE = "none" PRESERVE_QUARANTINED_DIR_NAME = "quarantine" PRESERVE_PRESERVED_DIR_NAME = "preserved" ################################################################################################### FILE_SCAN_RESULT_SCANNER = "scanner" FILE_SCAN_RESULT_FILE = "file" FILE_SCAN_RESULT_FILE_SIZE = "size" FILE_SCAN_RESULT_FILE_TYPE = "type" FILE_SCAN_RESULT_ENGINES = "engines" FILE_SCAN_RESULT_HITS = "hits" FILE_SCAN_RESULT_MESSAGE = "message" FILE_SCAN_RESULT_DESCRIPTION = "description" ################################################################################################### # the notice field for the signature.log we're writing out mimicing Zeek ZEEK_SIGNATURE_NOTICE = "Signatures::Sensitive_Signature" ################################################################################################### # VirusTotal public API VTOT_MAX_REQS = 4 # maximum 4 public API requests (default) VTOT_MAX_SEC = 60 # in 60 seconds (default) VTOT_CHECK_INTERVAL = 0.05 VTOT_URL = 'https://www.virustotal.com/vtapi/v2/file/report' 
# VirusTotal response codes
VTOT_RESP_NOT_FOUND = 0
VTOT_RESP_FOUND = 1
VTOT_RESP_QUEUED = -2

###################################################################################################
# Malass web API
MAL_MAX_REQS = 20  # maximum scanning requests concurrently
MAL_END_OF_TRANSACTION = "End_of_Transaction"
MAL_SUBMIT_TIMEOUT_SEC = 60
MAL_CHECK_INTERVAL = 1
MAL_RESP_NOT_FOUND = 0
MAL_RESP_FOUND = 1
MAL_RESP_QUEUED = -2

###################################################################################################
# ClamAV Interface
CLAM_MAX_REQS = 8  # maximum scanning requests concurrently, should be <= clamd.conf MaxThreads
CLAM_SUBMIT_TIMEOUT_SEC = 10
CLAM_CHECK_INTERVAL = 0.1
CLAM_ENGINE_ID = "ClamAV"
CLAM_FOUND_KEY = "FOUND"

###################################################################################################
# Yara Interface
# rules directory comes from the YARA_RULES_DIR environment variable (default "/yara-rules");
# joining with an empty component leaves it with a trailing path separator
YARA_RULES_DIR = os.path.join(os.environ.get("YARA_RULES_DIR", "/yara-rules"), "")
YARA_CUSTOM_RULES_DIR = os.path.join(YARA_RULES_DIR, "custom")
YARA_SUBMIT_TIMEOUT_SEC = 60
YARA_ENGINE_ID = "Yara"
YARA_MAX_REQS = 8  # maximum scanning threads concurrently
YARA_CHECK_INTERVAL = 0.1
YARA_RUN_TIMEOUT_SEC = 300

###################################################################################################
# Capa
CAPA_MAX_REQS = 4  # maximum scanning threads concurrently
CAPA_SUBMIT_TIMEOUT_SEC = 60
CAPA_ENGINE_ID = "Capa"
CAPA_CHECK_INTERVAL = 0.1
# mime types selected for capa scanning
CAPA_MIMES_TO_SCAN = (
    "application/bat",
    "application/ecmascript",
    "application/javascript",
    "application/PowerShell",
    "application/vnd.microsoft.portable-executable",
    "application/x-bat",
    "application/x-dosexec",
    "application/x-executable",
    "application/x-msdos-program",
    "application/x-msdownload",
    "application/x-pe-app-32bit-i386",
    "application/x-sh",
    "text/jscript",
    "text/vbscript",
    "text/x-python",
    "text/x-shellscript",
)
CAPA_VIV_SUFFIX = ".viv"
CAPA_VIV_MIME = "data"
CAPA_ATTACK_KEY = "att&ck"
CAPA_RUN_TIMEOUT_SEC = 300

###################################################################################################
# a structure representing the fields of a line of Zeek's signatures.log, and the corresponding string formatting and type definitions
class BroSignatureLine:
    """One record of a Zeek-style signatures log.

    Every field defaults to '-' and is stored exactly as given; str() renders
    the record as one tab-separated line in ``__slots__`` order.
    """

    __slots__ = (
        'ts',
        'uid',
        'orig_h',
        'orig_p',
        'resp_h',
        'resp_p',
        'note',
        'signature_id',
        'event_message',
        'sub_message',
        'signature_count',
        'host_count',
    )

    def __init__(
        self,
        ts='-',
        uid='-',
        orig_h='-',
        orig_p='-',
        resp_h='-',
        resp_p='-',
        note='-',
        signature_id='-',
        event_message='-',
        sub_message='-',
        signature_count='-',
        host_count='-',
    ):
        self.ts = ts
        self.uid = uid
        self.orig_h = orig_h
        self.orig_p = orig_p
        self.resp_h = resp_h
        self.resp_p = resp_p
        self.note = note
        self.signature_id = signature_id
        self.event_message = event_message
        self.sub_message = sub_message
        self.signature_count = signature_count
        self.host_count = host_count

    def __str__(self):
        """Return all fields, converted with str(), joined by tabs."""
        return "\t".join(
            map(
                str,
                [
                    self.ts,
                    self.uid,
                    self.orig_h,
                    self.orig_p,
                    self.resp_h,
                    self.resp_p,
                    self.note,
                    self.signature_id,
                    self.event_message,
                    self.sub_message,
                    self.signature_count,
                    self.host_count,
                ],
            )
        )

    @classmethod
    def signature_format_line(cls):
        """Return a tab-separated str.format() template with one '{field}' placeholder per slot."""
        return "\t".join('{' + field + '}' for field in cls.__slots__)

    @classmethod
    def signature_types_line(cls):
        """Return the tab-separated type names, one per field, in the same order as ``__slots__``."""
        return "\t".join(
            [
                'time',
                'string',
                'addr',
                'port',
                'addr',
                'port',
                'enum',
                'string',
                'string',
                'string',
                'count',
                'count',
            ]
        )


# AnalyzerScan
#   .provider - a FileScanProvider subclass doing the scan/lookup
#   .name - the filename to be scanned
#   .size - the size (in bytes) of the file
#   .fileType - the file's mime type
#   .submissionResponse - a unique identifier to be returned by the provider with which to check status
class AnalyzerScan:
    """Plain record describing one submitted scan/lookup; all fields default to None."""

    __slots__ = ('provider', 'name', 'size', 'fileType', 'submissionResponse')

    def __init__(self, provider=None, name=None, size=None, fileType=None, submissionResponse=None):
        self.provider = provider
        self.name = name
        self.size = size
        self.fileType = fileType
        self.submissionResponse = submissionResponse


# AnalyzerResult
#   .finished - the scan/lookup is no longer executing (whether or not it was successful or returned a "match")
#   .success - requesting the status was done successfully (whether or not it was finished)
#   .verbose - flag stored as given (NOTE(review): its meaning is not defined in this section - confirm with providers)
#   .result - the "result" of the scan/lookup, in whatever format is native to the provider
class AnalyzerResult:
    """Plain record describing the status/outcome of a scan; flags default to False, result to None."""

    __slots__ = ('finished', 'success', 'verbose', 'result')

    def __init__(self, finished=False, success=False, verbose=False, result=None):
        self.finished = finished
        self.success = success
        self.verbose = verbose
        self.result = result


# the filename parts used by our Zeek instance for extracted files:
#   source-fuid-uid-time.ext, eg., SSL-FTnzwn4hEPJi7BfzRk-CsRaviydrGyYROuX3-20190402105425.crt
class ExtractedFileNameParts:
    """Plain record holding the components of an extracted file's name; all default to None."""

    __slots__ = ('source', 'fid', 'uid', 'time', 'ext')

    def __init__(self, source=None, fid=None, uid=None, time=None, ext=None):
        self.source = source
        self.fid = fid
        self.uid = uid
        self.time = time
        self.ext = ext


###################################################################################################
# convenient boolean argument parsing
def str2bool(v):
    """Convert a yes/no style string to a bool.

    Accepts (case-insensitively) 'yes', 'true', 't', 'y', '1' as True and
    'no', 'false', 'f', 'n', '0' as False. A value that is already a bool is
    returned unchanged.

    Raises:
        argparse.ArgumentTypeError: if the string is not one of the accepted values.
    """
    # imported here because argparse is not among the imports at the top of this file;
    # without it the raise below failed with NameError instead of ArgumentTypeError
    import argparse

    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


###################################################################################################
# print to stderr
def eprint(*args, **kwargs):
    """print() to stderr, prefixing the output with the current local time (YYYY-MM-DD HH:MM:SS)."""
    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), *args, file=sys.stderr, **kwargs)


###################################################################################################
# calculate a sha256 hash of a file
def sha256sum(filename):
    """Return the hex SHA-256 digest of the file at *filename*.

    The file is read unbuffered in 64 KiB chunks into a single reusable buffer,
    so it is never held in memory all at once.
    """
    h = hashlib.sha256()
    b = bytearray(64 * 1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        # readinto() returns the number of bytes read; 0 signals end of file
        for n in iter(lambda: f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()

################################################################################################### # recursive dictionary key search def dictsearch(d, target): val = filter(None, [[b] if a == target else dictsearch(b, target) if isinstance(b, dict) else None for a, b in d.items()]) return [i for b in val for i in b] ################################################################################################### # filespec to various fields as per the extractor zeek script (/opt/zeek/share/zeek/site/extractor.zeek) # source-fuid-uid-time.ext # eg. # SSL-FTnzwn4hEPJi7BfzRk-CsRaviydrGyYROuX3-20190402105425.crt # # there are other extracted files that come from the mitre-attack/bzar scripts, they are formatted like this: # local fname = fmt("%s_%s%s", c$uid, f$id, subst_string(smb_name, "\\", "_")); # # CR7X4q2hmcXKqP0vVj_F3jZ2VjYttqhKaGfh__172.16.1.8_C$_WINDOWS_sny4u_un1zbd94ytwj99hcymmsad7j54gr4wdskwnqs0ki252jdsrf763zsm531b.exe # └----------------┘ └---------------┘└------------------------------------------------------------------------------------------┘ # UID FID subst_string(smb_name, "\\", "_")) # # (see https://github.com/mitre-attack/bzar/blob/master/scripts/bzar_files.zeek#L50) def extracted_filespec_to_fields(filespec): baseFileSpec = os.path.basename(filespec) match = re.search(r'^(?P.*)-(?P.*)-(?P.*)-(?P