#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function from bs4 import BeautifulSoup import argparse import os import sys ################################################################################################### debug = False PY3 = (sys.version_info.major >= 3) scriptName = os.path.basename(__file__) scriptPath = os.path.dirname(os.path.realpath(__file__)) origPath = os.getcwd() ################################################################################################### if not PY3: if hasattr(__builtins__, 'raw_input'): input = raw_input try: FileNotFoundError except NameError: FileNotFoundError = IOError ################################################################################################### # print to stderr def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) ################################################################################################### # convenient boolean argument parsing def str2bool(v): if v.lower() in ('yes', 'true', 't', 'y', '1'): return True elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False else: raise argparse.ArgumentTypeError('Boolean value expected.') ################################################################################################### # main def main(): global debug parser = argparse.ArgumentParser(description=scriptName, add_help=False, usage='{} '.format(scriptName)) parser.add_argument('-v', '--verbose', dest='debug', type=str2bool, nargs='?', const=True, default=False, help="Verbose output") parser.add_argument('-i', '--input', required=True, metavar='', type=str, help='Input file') parser.add_argument('-o', '--output', required=True, metavar='', type=str, help='Output file') parser.add_argument('-c', '--div-class', required=True, dest='divClass', metavar='', type=str, default='', help='div class to remove') parser.add_argument('-p', '--parser', required=False, dest='parser', metavar='', type=str, default='html.parser', help='BeautifulSoup parser') parser.add_argument('-e', '--encoding', required=False, dest='encoding', metavar='', type=str, default='utf-8', help='Encoding for output file') try: parser.error = parser.exit args = parser.parse_args() except SystemExit: parser.print_help() exit(2) debug = args.debug if debug: eprint(os.path.join(scriptPath, scriptName)) eprint("Arguments: {}".format(sys.argv[1:])) eprint("Arguments: {}".format(args)) else: sys.tracebacklimit = 0 soup = BeautifulSoup(open(args.input), args.parser) for div in soup.find_all("div", { 'class' : args.divClass }): div.decompose() with open(args.output, 'wb') as f: f.write(soup.prettify(args.encoding)) if __name__ == '__main__': main()