Imported Upstream version 1.2.1 upstream/1.2.1
author Devon Kearns <dookie@kali.org>
Wed, 3 Jul 2013 16:28:20 +0000 (10:28 -0600)
committer Devon Kearns <dookie@kali.org>
Wed, 3 Jul 2013 16:28:20 +0000 (10:28 -0600)
38 files changed:
bin/binwalk
binwalk/__init__.py
binwalk/common.py
binwalk/config.py
binwalk/config/extract.conf
binwalk/entropy.py [new file with mode: 0644]
binwalk/extractor.py
binwalk/filter.py
binwalk/magic/binarch
binwalk/magic/bincast
binwalk/magic/binwalk
binwalk/parser.py
binwalk/plugins.py [new file with mode: 0644]
binwalk/plugins/armopcodes.py [new file with mode: 0644]
binwalk/plugins/cpio.py [new file with mode: 0644]
binwalk/plugins/strcompat.py [new file with mode: 0644]
binwalk/prettyprint.py
binwalk/smartsig.py [deleted file]
binwalk/smartsignature.py [new file with mode: 0644]
binwalk/smartstrings.py [new file with mode: 0644]
binwalk/update.py
debian_quick_install.sh [new file with mode: 0755]
docs/API
docs/COPYING [deleted file]
docs/LICENSE [new file with mode: 0644]
docs/README
magic/archives
magic/bootloaders
magic/compressed
magic/crypto
magic/executables
magic/filesystems
magic/firmware
magic/images
magic/kernels
magic/lzma
setup.py
support/lzma_gen.py

index 8622cdf..ac145ab 100755 (executable)
@@ -4,44 +4,116 @@ import sys
 import os.path
 import binwalk
 from threading import Thread
-from binwalk.common import str2int
 from getopt import GetoptError, getopt as GetOpt
 
-def display_status(bwalk):
+def display_status():
+       global bwalk
+
        while True:
                # Display the current scan progress when the enter key is pressed.
-               raw_input()
-               print "Progress: %.2f%% (%d / %d)\n" % (((float(bwalk.total_scanned) / float(bwalk.scan_length)) * 100), bwalk.total_scanned, bwalk.scan_length)
+               try:
+                       raw_input()
+                       print "Progress: %.2f%% (%d / %d)\n" % (((float(bwalk.total_scanned) / float(bwalk.scan_length)) * 100), bwalk.total_scanned, bwalk.scan_length)
+               except Exception, e:
+                       pass
+
+def examples():
+       name = os.path.basename(sys.argv[0])
+
+       print """
+Scanning firmware for file signatures:
+
+\t$ %s firmware.bin
+
+Extracting files from firmware:
+
+\t$ %s -Me firmware.bin
+
+Scanning firmware for executable code:
+
+\t$ %s -A firmware.bin
+
+Performing a firmware strings analysis:
+
+\t$ %s -S firmware.bin
+
+Performing a firmware entropy analysis:
+
+\t$ %s -E firmware.bin
+
+Display identified file signatures on entropy graph:
+
+\t$ %s -EB firmware.bin
+
+See http://code.google.com/p/binwalk/wiki/TableOfContents for more.
+""" % (name, name, name, name, name, name)
+       sys.exit(0)
 
 def usage(fd):
        fd.write("\n")
+       
        fd.write("Binwalk v%s\n" % binwalk.Config.VERSION)
        fd.write("Craig Heffner, http://www.devttys0.com\n")
        fd.write("\n")
+       
        fd.write("Usage: %s [OPTIONS] [FILE1] [FILE2] [FILE3] ...\n" % os.path.basename(sys.argv[0]))
        fd.write("\n")
-       fd.write("\t-o, --offset=<int>            Start scan at this file offset\n")
-       fd.write("\t-l, --length=<int>            Number of bytes to scan\n")
-       fd.write("\t-b, --align=<int>             Set byte alignment [default: 1]\n")
+       
+       fd.write("Signature Analysis:\n")
+       fd.write("\t-B, --binwalk                 Perform a file signature scan (default)\n")
+       fd.write("\t-R, --raw-bytes=<string>      Search for a custom signature\n")
+       fd.write("\t-A, --opcodes                 Scan for executable code signatures\n")
+       fd.write("\t-C, --cast                    Cast file contents as various data types\n")
        fd.write("\t-m, --magic=<file>            Specify an alternate magic file to use\n")
-       fd.write("\t-i, --include=<filter>        Include matches that are normally excluded and that have <filter> in their description\n")
        fd.write("\t-x, --exclude=<filter>        Exclude matches that have <filter> in their description\n")
-       fd.write("\t-y, --search=<filter>         Only search for matches that have <filter> in their description\n")
-       fd.write("\t-g, --grep=<text>             Grep results for the specified text\n")
-       fd.write("\t-R, --raw-bytes=<string>      Search for a sequence of raw bytes instead of using the default magic signatures\n")
-       fd.write("\t-f, --file=<file>             Log results to file\n")
-       fd.write("\t-D, --dd=<type:ext[:cmd]>     Extract entries whose descriptions match <type>, give them file extension <ext>, and execute <cmd>\n")
-       fd.write("\t-e, --extract=[file]          Automatically extract known file types. Load rules from file, if specified.\n")
-       fd.write("\t-r, --rm                      Cleanup extracted files and zero-size files\n")
-       fd.write("\t-d, --delay                   Delay file extraction for files with known footers\n")
-       fd.write("\t-a, --all                     Include all short signatures\n")
+       fd.write("\t-y, --include=<filter>        Only search for matches that have <filter> in their description\n")
        fd.write("\t-I, --show-invalid            Show results marked as invalid\n")
-       fd.write("\t-A, --opcodes                 Scan for executable code\n")
-       fd.write("\t-C, --cast                    Cast file contents as various data types\n")
        fd.write("\t-k, --keep-going              Show all matching results at a given offset, not just the first one\n")
+       fd.write("\t-b, --dumb                    Disable smart signature keywords\n")
+       fd.write("\n")
+
+       fd.write("Strings Analysis:\n")
+       fd.write("\t-S, --strings                 Scan for ASCII strings (may be combined with -B, -R, -A, or -E)\n")
+       fd.write("\t-s, --strlen=<n>              Set the minimum string length to search for (default: 3)\n")
+       fd.write("\n")
+       
+       fd.write("Entropy Analysis:\n")
+       fd.write("\t-E, --entropy                 Plot file entropy (may be combined with -B, -R, -A, or -S)\n")
+       fd.write("\t-K, --block=<int>             Set the block size for entropy analysis\n")
+       fd.write("\t-a, --shannon                 Use the Shannon entropy algorithm\n")
+       fd.write("\t-N, --no-plot                 Do not generate an entropy plot graph\n")
+       fd.write("\t-F, --marker=<offset:name>    Add a marker to the entropy plot graph\n")
+       fd.write("\t-Q, --no-legend               Omit the legend from the entropy plot graph\n")
+       fd.write("\t-J, --save-plot               Save plot as an SVG (implied if multiple files are specified)\n")
+       fd.write("\n")
+
+       fd.write("Extraction Options:\n")
+       fd.write("\t-D, --dd=<type:ext[:cmd]>     Extract <type> signatures, give the files an extension of <ext>, and execute <cmd>\n")
+       fd.write("\t-e, --extract=[file]          Automatically extract known file types; load rules from file, if specified\n")
+       fd.write("\t-M, --matryoshka              Recursively scan extracted files, up to 8 levels deep\n")
+       fd.write("\t-r, --rm                      Cleanup extracted files and zero-size files\n")
+       fd.write("\t-d, --delay                   Delay file extraction for files with known footers\n")
+       fd.write("\n")
+
+       fd.write("Plugin Options:\n")
+       fd.write("\t-X, --disable-plugin=<name>   Disable a plugin by name\n")
+       fd.write("\t-Y, --enable-plugin=<name>    Enable a plugin by name\n")
+       fd.write("\t-p, --disable-plugins         Do not load any binwalk plugins\n")
+       fd.write("\t-L, --list-plugins            List all user and system plugins by name\n")
+       fd.write("\n")
+
+       fd.write("General Options:\n")  
+       fd.write("\t-o, --offset=<int>            Start scan at this file offset\n")
+       fd.write("\t-l, --length=<int>            Number of bytes to scan\n")
+       fd.write("\t-g, --grep=<text>             Grep results for the specified text\n")
+       fd.write("\t-f, --file=<file>             Log results to file\n")
+       fd.write("\t-c, --csv                     Log results to file in csv format\n")
+       fd.write("\t-O, --skip-unopened           Ignore file open errors and process only the files that can be opened\n")
+       fd.write("\t-t, --term                    Format output to fit the terminal window\n")
        fd.write("\t-q, --quiet                   Supress output to stdout\n")
        fd.write("\t-v, --verbose                 Be verbose (specify twice for very verbose)\n")
        fd.write("\t-u, --update                  Update magic signature files\n")
+       fd.write("\t-?, --examples                Show example usage\n")
        fd.write("\t-h, --help                    Show help output\n")
        fd.write("\n")
 
@@ -51,55 +123,96 @@ def usage(fd):
                sys.exit(0)
 
 def main():
+       # The Binwalk class instance must be global so that the display_status thread can access it.
+       global bwalk
+
        MIN_ARGC = 2
-       align = 1
+
+       requested_scans = []    
        offset = 0
        length = 0
-       quiet = False
-       pre_filter = True
+       strlen = 0
        verbose = 0
+       matryoshka = 1
+       entropy_block = 0
+       failed_open_count = 0
+       quiet = False
+       do_files = False
        log_file = None
+       do_csv = False
+       save_plot = False
+       show_plot = True
+       show_legend = True
+       entropy_scan = False
+       enable_plugins = True
        show_invalid = False
-       short_sig = True
+       entropy_algorithm = None
+       format_to_terminal = False
        custom_signature = None
        delay_extraction = False
        extract_rules_file = None
+       ignore_failed_open = False
        extract_from_config = False
        cleanup_after_extract = False
+       explicit_signature_scan = False
+       ignore_signature_keywords = False
        magic_flags = binwalk.magic.MAGIC_NONE
-       options = []
+       markers = []
        magic_files = []
+       file_opt_list = []
        target_files = []
        greps = []
-       includes = []
        excludes = []
        searches = []
        extracts = []
+       options = []
+       arguments = []
+       plugin_whitelist = []
+       plugin_blacklist = []
 
        config = binwalk.Config()
 
-       short_options = "aACdhkeqruvPIf:o:l:b:i:x:y:D:m:R:g:"
+       short_options = "AaBbCcdEehIJkLMNnOPpQqrStuv?D:F:f:g:K:o:l:m:R:s:X:x:Y:y:"
        long_options = [
                        "rm",
-                       "all",
-                       "help", 
+                       "help",
+                       "examples",
                        "quiet", 
+                       "csv",
                        "verbose",
                        "opcodes",
                        "cast",
                        "update",
+                       "binwalk", 
                        "keep-going",
                        "show-invalid",
                        "profile",
                        "delay",
+                       "skip-unopened",
+                       "term",
+                       "tim",
+                       "dumb",
+                       "entropy",
+                       "shannon",
+                       "save-plot",
+                       "no-plot",
+                       "no-legend", 
+                       "matryoshka",
+                       "strings",
+                       "list-plugins",
+                       "disable-plugins",
+                       "disable-plugin=",
+                       "enable-plugin=",
+                       "marker=",
+                       "strlen=",
                        "file=", 
+                       "block=",
                        "offset=", 
                        "length=", 
-                       "align=",
-                       "include=",
                        "exclude=",
-                       "extract=",
+                       "include=",
                        "search=",
+                       "extract=",
                        "dd=",
                        "grep=",
                        "magic=",
@@ -119,23 +232,45 @@ def main():
        for opt, arg in opts:
                if opt in ("-h", "--help"):
                        usage(sys.stdout)
+               elif opt in ("-?", "--examples"):
+                       examples()
                elif opt in ("-d", "--delay"):
                        delay_extraction = True
                elif opt in ("-f", "--file"):
                        log_file = arg
+               elif opt in ("-c", "--csv"):
+                       do_csv = True
                elif opt in ("-q", "--quiet"):
                        quiet = True
+               elif opt in ("-s", "--strlen"):
+                       strlen = binwalk.common.str2int(arg)
+               elif opt in ("-Q", "--no-legend"):
+                       show_legend = False
+               elif opt in ("-J", "--save-plot"):
+                       save_plot = True
+               elif opt in ("-E", "--entropy"):
+                       requested_scans.append(binwalk.Binwalk.ENTROPY)
+               elif opt in ("-a", "--shannon"):
+                       entropy_algorithm = 'shannon'
+               elif opt in("-t", "--term", "--tim"):
+                       format_to_terminal = True
+               elif opt in("-p", "--disable-plugins"):
+                       enable_plugins = False
+               elif opt in ("-b", "--dumb"):
+                       ignore_signature_keywords = True
                elif opt in ("-v", "--verbose"):
                        verbose += 1
+               elif opt in ("-N", "--no-plot"):
+                       show_plot = False
+               elif opt in ("-S", "--strings"):
+                       requested_scans.append(binwalk.Binwalk.STRINGS)
+               elif opt in ("-O", "--skip-unopened"):
+                       ignore_failed_open = True
                elif opt in ("-o", "--offset"):
-                       offset = str2int(arg)
+                       offset = binwalk.common.str2int(arg)
                elif opt in ("-l", "--length"):
-                       length = str2int(arg)
-               elif opt in ("-b", "--align"):
-                       align = str2int(arg)
-               elif opt in ("-i", "--include"):
-                       includes.append(arg)
-               elif opt in ("-y", "--search"):
+                       length = binwalk.common.str2int(arg)
+               elif opt in ("-y", "--search", "--include"):
                        searches.append(arg)
                elif opt in ("-x", "--exclude"):
                        excludes.append(arg)
@@ -143,46 +278,66 @@ def main():
                        extracts.append(arg)
                elif opt in ("-g", "--grep"):
                        greps.append(arg)
-               elif opt in ("-e", "--extract"):
-                       if arg:
-                               extract_rules_file = arg
-                       else:
-                               extract_from_config = True
                elif opt in ("-r", "--rm"):
                        cleanup_after_extract = True
                elif opt in ("-m", "--magic"):
                        magic_files.append(arg)
-               elif opt in ("-a", "--all"):
-                       short_sig = False
                elif opt in ("-k", "--keep-going"):
                        magic_flags |= binwalk.magic.MAGIC_CONTINUE
                elif opt in ("-I", "--show-invalid"):
                        show_invalid = True
-
+               elif opt in ("-B", "--binwalk"):
+                       requested_scans.append(binwalk.Binwalk.BINWALK)
+               elif opt in ("-M", "--matryoshka"):
+                       # Original Zvyozdochkin matryoshka set had 8 dolls. This is a good number.
+                       matryoshka = 8
+               elif opt in ("-K", "--block"):
+                       entropy_block = binwalk.common.str2int(arg)
+               elif opt in ("-X", "--disable-plugin"):
+                       plugin_blacklist.append(arg)
+               elif opt in ("-Y", "--enable-plugin"):
+                       plugin_whitelist.append(arg)
+
+               elif opt in ("-F", "--marker"):
+                       if ':' in arg:
+                               (location, description) = arg.split(':', 1)
+                               location = int(location)
+                               markers.append((location, [{'description' : description, 'offset' : location}]))
+               elif opt in("-L", "--list-plugins"):
+                       # List all user and system plugins, then exit
+                       print ''
+                       print 'NAME             TYPE       ENABLED    DESCRIPTION'
+                       print '-' * 115
+                       with binwalk.Binwalk() as bw:
+                               for (key, info) in binwalk.plugins.Plugins(bw).list_plugins().iteritems():
+                                       for module_name in info['modules']:
+                                               print '%-16s %-10s %-10s %s' % (module_name, key, info['enabled'][module_name], info['descriptions'][module_name])
+                       print ''
+                       sys.exit(1)
+               elif opt in ("-e", "--extract"):
+                       # If a file path was specified, use that as the extraction rules file
+                       if arg:
+                               extract_from_config = False
+                               extract_rules_file = arg
+                       # Else, use the default rules file
+                       else:
+                               extract_from_config = True
                elif opt in ("-A", "--opcodes"):
-                       # Check every single offset
-                       align = 1
-                       # Don't filter out short signatures as some opcode sigs are only 2 bytes
-                       short_sig = False
+                       requested_scans.append(binwalk.Binwalk.BINARCH)
                        # Load user file first so its signatures take precedence
                        magic_files.append(config.paths['user'][config.BINARCH_MAGIC_FILE])
                        magic_files.append(config.paths['system'][config.BINARCH_MAGIC_FILE])
                elif opt in ("-C", "--cast"):
-                       # Check every single offset
-                       align = 1
+                       requested_scans.append(binwalk.Binwalk.BINCAST)
                        # Don't stop at the first match (everything matches everything in this scan)
                        magic_flags |= binwalk.magic.MAGIC_CONTINUE
-                       # Disable all pre filtering; we want to check everything for this scan
-                       pre_filter = False
-                       # Don't filter shot signatures, or else some casts won't be displayed
-                       short_sig = False
                        # Load user file first so its signatures take precedence
                        magic_files.append(config.paths['user'][config.BINCAST_MAGIC_FILE])
                        magic_files.append(config.paths['system'][config.BINCAST_MAGIC_FILE])
                elif opt in ("-R", "--raw-bytes"):
-                       # Disable short signature filtering, as the supplied string may be short
-                       short_sig = False
                        custom_signature = arg
+                       requested_scans.append(binwalk.Binwalk.BINWALK)
+                       explicit_signature_scan = True
                elif opt in ("-u", "--update"):
                        try:
                                sys.stdout.write("Updating signatures...")
@@ -198,31 +353,56 @@ def main():
                                else:
                                        sys.stderr.write('\n' + str(e) + '\n')
                                sys.exit(1)
+               
                # The --profile option is handled prior to calling main()
                elif opt not in ('-P', '--profile'):
                        usage(sys.stderr)
 
-               # Append the option and argument to the list of processed options
-               # This is used later to determine which argv entries are file names
+               # Keep track of the options and arguments.
+               # This is used later to determine which argv entries are file names.
                options.append(opt)
-               options.append(arg)
                options.append("%s%s" % (opt, arg))
                options.append("%s=%s" % (opt, arg))
-
-       # Treat any command line options not processed by getopt as target file paths
+               arguments.append(arg)
+               
+       # Treat any command line options not processed by getopt as target file paths.
        for opt in sys.argv[1:]:
-               #TODO: Do we really want to not process valid files that start with a '-'?
-               #      This is probably OK, and ensures that no options are treated as target files.
-               if opt not in options and not opt.startswith('-'):
-                       target_files.append(opt)
+               if opt not in arguments and opt not in options and not opt.startswith('-'):
+                       file_opt_list.append(opt)
+
+       # Validate the candidate files in file_opt_list; those that can be opened are added to target_files
+       for tfile in file_opt_list:
+               # Ignore directories.
+               if not os.path.isdir(tfile):
+                       # Make sure we can open the target files
+                       try:
+                               fd = open(tfile, "rb")
+                               fd.close()
+                               target_files.append(tfile)
+                       except Exception, e:
+                               sys.stdout.write("Cannot open file : %s\n" % str(e))
+                               failed_open_count += 1
+
+       # Unless -O was specified, don't run the scan unless we are able to scan all specified files
+       if failed_open_count > 0 and not ignore_failed_open:
+               if failed_open_count > 1:
+                       plural = 's'
+               else:
+                       plural = ''
+               sys.stdout.write("Failed to open %d file%s for scanning, quitting...\n" % (failed_open_count, plural))
+               sys.exit(1)
 
        # If more than one target file was specified, enable verbose mode; else, there is
        # nothing in the output to indicate which scan corresponds to which file.
-       if len(target_files) > 1:
-               verbose = True
+       if (matryoshka > 1 or len(target_files) > 1):
+               save_plot = True
+               if not verbose:
+                       verbose = 1
+       elif len(target_files) == 0:
+               usage(sys.stderr)
 
        # Instantiate the Binwalk class
-       bwalk = binwalk.Binwalk(flags=magic_flags, verbose=verbose, log=log_file, quiet=quiet)
+       bwalk = binwalk.Binwalk(flags=magic_flags, verbose=verbose, log=log_file, quiet=quiet, ignore_smart_keywords=ignore_signature_keywords, load_plugins=enable_plugins)
 
        # If a custom signature was specified, create a temporary magic file containing the custom signature
        # and ensure that it is the only magic file that will be loaded when Binwalk.scan() is called.
@@ -230,7 +410,6 @@ def main():
                magic_files = bwalk.parser.file_from_string(custom_signature)
 
        # Set any specified filters
-       bwalk.filter.include(includes, exclusive=False)
        bwalk.filter.exclude(excludes)
        bwalk.filter.include(searches)
        bwalk.filter.grep(filters=greps)
@@ -253,32 +432,99 @@ def main():
        bwalk.extractor.enable_delayed_extract(delay_extraction)
 
        # Load the magic file(s)
-       bwalk.load_signatures(magic_files=magic_files, pre_filter_signatures=pre_filter, filter_short_signatures=short_sig)
-       
-       # Scan each target file
-       for target_file in target_files:
-               bwalk.display.header(target_file)
+       bwalk.load_signatures(magic_files=magic_files)
 
-               # Start the display_status function as a daemon thread
-               t = Thread(target=display_status, args=(bwalk,))
-               t.setDaemon(True)
-               t.start()
+       # If --term was specified, enable output formatting to terminal
+       if format_to_terminal:
+               bwalk.display.enable_formatting(True)
 
-               # Catch keyboard interrupts so that we can properly clean up after the scan
-               try:
-                       bwalk.scan(target_file, 
-                               offset=offset, 
-                               length=length, 
-                               align=align,
-                               show_invalid_results=show_invalid, 
-                               callback=bwalk.display.results)
-               except KeyboardInterrupt:
-                       pass
+       # Enable log file CSV formatting, if specified
+       if do_csv:
+               bwalk.display.enable_csv()
+
+       # If no scan was explicitly requested, do a binwalk scan
+       if not requested_scans:
+               requested_scans.append(binwalk.Binwalk.BINWALK)
 
-               bwalk.display.footer()
+       # Sort the scan types to ensure the entropy scan is performed last
+       requested_scans.sort()
 
-       # Be sure to drink your ovaltine.
-       # And also to clean up any temporary magic files.
+       # Everything is set up, let's do a scan
+       try:
+               results = {}
+
+               # Start the display_status function as a daemon thread.
+               t = Thread(target=display_status)
+               t.setDaemon(True)
+               t.start()
+               
+               for scan_type in requested_scans:
+
+                       if scan_type in [binwalk.Binwalk.BINWALK, binwalk.Binwalk.BINARCH, binwalk.Binwalk.BINCAST]:
+
+                               # There's no generic way for the binwalk class to know what
+                               # scan type is being run, since these are all signature scans,
+                               # just with different magic files. Manually set the scan sub-type
+                               # here to ensure that plugins can differentiate between the
+                               # scans being performed.
+                               bwalk.scan_type = scan_type
+
+                               r = bwalk.scan(target_files,
+                                               offset=offset, 
+                                               length=length, 
+                                               show_invalid_results=show_invalid, 
+                                               callback=bwalk.display.results, 
+                                               start_callback=bwalk.display.header,
+                                               end_callback=bwalk.display.footer,
+                                               matryoshka=matryoshka,
+                                               plugins_whitelist=plugin_whitelist,
+                                               plugins_blacklist=plugin_blacklist)
+
+                               bwalk.concatenate_results(results, r)
+
+                       elif scan_type == binwalk.Binwalk.STRINGS:
+
+                               r = bwalk.analyze_strings(target_files, 
+                                                       length=length, 
+                                                       offset=offset, 
+                                                       n=strlen, 
+                                                       block=entropy_block, 
+                                                       algorithm=entropy_algorithm,
+                                                       load_plugins=enable_plugins, 
+                                                       whitelist=plugin_whitelist, 
+                                                       blacklist=plugin_blacklist)
+                                       
+                               bwalk.concatenate_results(results, r)
+
+                       elif scan_type == binwalk.Binwalk.ENTROPY:
+
+                               if not results:
+                                       for target_file in target_files:
+                                               results[target_file] = []
+                               else:
+                                       bwalk.display.quiet = True
+                                       bwalk.display.cleanup()
+
+                               for target_file in results.keys():
+                                       bwalk.concatenate_results(results, {target_file : markers})
+
+                               bwalk.analyze_entropy(results,
+                                                       offset, 
+                                                       length, 
+                                                       entropy_block, 
+                                                       show_plot, 
+                                                       show_legend, 
+                                                       save_plot,
+                                                       algorithm=entropy_algorithm,
+                                                       load_plugins=enable_plugins,
+                                                       whitelist=plugin_whitelist,
+                                                       blacklist=plugin_blacklist)
+
+       except KeyboardInterrupt:
+               pass
+#      except Exception, e:
+#              print "Unexpected error:", str(e)
+               
        bwalk.cleanup()
 
 try:
index 742ec4b..61d17b3 100644 (file)
@@ -1,19 +1,25 @@
+__all__ = ["Binwalk"]
+
 import os
+import re
 import magic
 from config import *
 from update import *
 from filter import *
 from parser import *
-from smartsig import *
+from plugins import *
+from entropy import *
 from extractor import *
 from prettyprint import *
-from common import file_size
+from smartstrings import *
+from smartsignature import *
+from common import file_size, unique_file_name
 
-class Binwalk:
+class Binwalk(object):
        '''
        Primary Binwalk class.
 
-       Interesting class objects:
+       Useful class objects:
 
                self.filter        - An instance of the MagicFilter class.
                self.extractor     - An instance of the Extractor class.
@@ -22,6 +28,18 @@ class Binwalk:
                self.magic_files   - A list of magic file path strings to use whenever the scan() method is invoked.
                self.scan_length   - The total number of bytes to be scanned.
                self.total_scanned - The number of bytes that have already been scanned.
+               self.scan_type     - The type of scan being performed, one of: BINWALK, BINCAST, BINARCH, STRINGS, ENTROPY.
+
+       Performing a simple binwalk scan:
+
+               from binwalk import Binwalk
+                       
+               scan = Binwalk().scan(['firmware1.bin', 'firmware2.bin'])
+               for (filename, file_results) in scan.iteritems():
+                       print "Results for %s:" % filename
+                       for (offset, results) in file_results:
+                               for result in results:
+                                       print offset, result['description']
        '''
 
        # Default libmagic flags. Basically disable anything we don't need in the name of speed.
@@ -33,10 +51,11 @@ class Binwalk:
        # Passing the entire remaining buffer to libmagic is resource intensive and will
        # significantly slow the scan; this value represents a reasonable buffer size to
        # pass to libmagic which will not drastically affect scan time.
-       MAX_SIGNATURE_SIZE = 8092
+       MAX_SIGNATURE_SIZE = 8 * 1024
 
-       # Max number of bytes to process at one time. Everyone should have 50MB of memory, right?
-       READ_BLOCK_SIZE = 50 * 1024 * 1024
+       # Max number of bytes to process at one time. This needs to be large enough to 
+       # limit disk I/O, but small enough to limit the size of processed data blocks.
+       READ_BLOCK_SIZE = 1 * 1024 * 1024
 
        # Minimum verbosity level at which to enable extractor verbosity.
        VERY_VERBOSE = 2
@@ -44,26 +63,44 @@ class Binwalk:
        # Scan every byte by default.
        DEFAULT_BYTE_ALIGNMENT = 1
 
-       def __init__(self, magic_files=[], flags=magic.MAGIC_NONE, log=None, quiet=False, verbose=0):
+       # Valid scan_type values.
+       # ENTROPY must be the largest value to ensure it is performed last if multiple scans are performed.
+       BINWALK = 0x01
+       BINARCH = 0x02
+       BINCAST = 0x04
+       STRINGS = 0x08
+       ENTROPY = 0x10
+
+       def __init__(self, magic_files=[], flags=magic.MAGIC_NONE, log=None, quiet=False, verbose=0, ignore_smart_keywords=False, load_extractor=False, load_plugins=True):
                '''
                Class constructor.
 
-               @magic_files - A list of magic files to use.
-               @flags       - Flags to pass to magic_open. [TODO: Might this be more appropriate as an argument to load_signaures?]
-               @log         - Output PrettyPrint data to log file as well as to stdout.
-               @quiet       - If set to True, supress PrettyPrint output to stdout.
-               @verbose     - Verbosity level.
+               @magic_files            - A list of magic files to use.
+               @flags                  - Flags to pass to magic_open. [TODO: Might this be more appropriate as an argument to load_signatures?]
+               @log                    - Output PrettyPrint data to log file as well as to stdout.
+               @quiet                  - If set to True, suppress PrettyPrint output to stdout.
+               @verbose                - Verbosity level.
+               @ignore_smart_keywords  - Set to True to ignore smart signature keywords.
+               @load_extractor         - Set to True to load the default extraction rules automatically.
+               @load_plugins           - Set to False to disable plugin support.
 
                Returns None.
                '''
                self.flags = self.DEFAULT_FLAGS | flags
+               self.last_extra_data_section = ''
+               self.load_plugins = load_plugins
                self.magic_files = magic_files
                self.verbose = verbose
                self.total_scanned = 0
                self.scan_length = 0
                self.total_read = 0
+               self.matryoshka = 1
+               self.plugins = None
                self.magic = None
                self.mfile = None
+               self.entropy = None
+               self.strings = None
+               self.scan_type = self.BINWALK
 
                # Instantiate the config class so we can access file/directory paths
                self.config = Config()
@@ -76,7 +113,6 @@ class Binwalk:
                                        self.config.paths['system'][self.config.BINWALK_MAGIC_FILE],
                        ]
 
-
                # Only set the extractor verbosity if told to be very verbose
                if self.verbose >= self.VERY_VERBOSE:
                        extractor_verbose = True
@@ -84,7 +120,7 @@ class Binwalk:
                        extractor_verbose = False
 
                # Create an instance of the PrettyPrint class, which can be used to print results to screen/file.
-               self.display = PrettyPrint(log=log, quiet=quiet, verbose=verbose, bwalk=self)
+               self.display = PrettyPrint(self, log=log, quiet=quiet, verbose=verbose)
 
                # Create MagicFilter and Extractor class instances. These can be used to:
                #
@@ -93,15 +129,20 @@ class Binwalk:
                #
                self.filter = MagicFilter()
                self.extractor = Extractor(verbose=extractor_verbose)
-               
+               if load_extractor:
+                       self.extractor.load_defaults()
+
                # Create SmartSignature and MagicParser class instances. These are mostly for internal use.
-               self.smart = SmartSignature(self.filter)
+               self.smart = SmartSignature(self.filter, ignore_smart_signatures=ignore_smart_keywords)
                self.parser = MagicParser(self.filter, self.smart)
 
        def __del__(self):
-               '''
-               Class deconstructor.
-               '''
+               self.cleanup()
+
+       def __enter__(self):
+               return self
+
+       def __exit__(self, t, v, traceback):
                self.cleanup()
 
        def cleanup(self):
@@ -115,40 +156,229 @@ class Binwalk:
                except:
                        pass
 
-       def load_signatures(self, magic_files=[], pre_filter_signatures=True, filter_short_signatures=True):
+       def load_signatures(self, magic_files=[]):
                '''
                Load signatures from magic file(s).
                Called automatically by Binwalk.scan() with all defaults, if not already called manually.
 
-               @magic_files                    - A list of magic files to use (default: self.magic_files).
-               @pre_filter_signatures          - Set to False to disable pre-filtering of signatures before invoking libmagic.
-               @filter_short_signatures        - Set to True to include signatures with short (<= 2 byte) magic strings.
+               @magic_files - A list of magic files to use (default: self.magic_files).
        
                Returns None.   
                '''
-               # Disable pre filtering in the smart signature class instance.
-               # This is also checked by Binwalk.scan() before performing pre-filtering.
-               self.smart.pre_filter = pre_filter_signatures
-
                # The magic files specified here override any already set
                if magic_files and magic_files is not None:
                        self.magic_files = magic_files
 
                # Parse the magic file(s) and initialize libmagic
-               self.mfile = self.parser.parse(self.magic_files, filter_short_signatures=filter_short_signatures, pre_filter_signatures=pre_filter_signatures)
+               self.mfile = self.parser.parse(self.magic_files)
                self.magic = magic.open(self.flags)
                self.magic.load(self.mfile)
 
-       def scan(self, target_file, offset=0, length=0, align=DEFAULT_BYTE_ALIGNMENT, show_invalid_results=False, callback=None):
+       def analyze_strings(self, file_names, length=0, offset=0, n=0, block=0, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
                '''
-               Performs a Binwalk scan on the target file.
+               Performs a strings analysis on the specified file(s).
+
+               @file_names   - A list of files to analyze.
+               @length       - The number of bytes in the file to analyze.
+               @offset       - The starting offset into the file to begin analysis.
+               @n            - The minimum valid string length.
+               @block        - The block size to use when performing entropy analysis.
+               @algorithm    - The entropy algorithm to use when performing entropy analysis.
+               @load_plugins - Set to False to disable plugin callbacks.
+               @whitelist    - A list of whitelisted plugins.
+               @blacklist    - A list of blacklisted plugins.
+               
+               Returns a dictionary compatible with other classes and methods (Entropy, Binwalk, analyze_entropy, etc):
+
+                       {
+                               'file_name' : (offset, [{
+                                                               'description' : 'Strings',
+                                                               'string'      : 'found_string'
+                                                       }]
+                                       )
+                       }
+               '''
+               data = {}
+
+               self.strings = Strings(file_names,
+                                       self,
+                                       length=length, 
+                                       offset=offset,
+                                       n=n,
+                                       block=block,
+                                       algorithm=algorithm,
+                                       load_plugins=load_plugins,
+                                       whitelist=whitelist,
+                                       blacklist=blacklist)
+
+               data = self.strings.strings()
+               
+               del self.strings
+               self.strings = None
+
+               return data
+
+       def analyze_entropy(self, files, offset=0, length=0, block=0, plot=True, legend=True, save=False, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
+                '''
+               Performs an entropy analysis on the specified file(s).
+
+               @files        - A dictionary containing file names and results data, as returned by Binwalk.scan.
+               @offset       - The offset into the data to begin analysis.
+               @length       - The number of bytes to analyze.
+               @block        - The size of the data blocks to analyze.
+               @plot         - Set to False to disable plotting.
+               @legend       - Set to False to exclude the legend and custom offset markers from the plot.
+               @save         - Set to True to save plots to disk instead of displaying them.
+               @algorithm    - Set to 'shannon' to use shannon entropy algorithm.
+               @load_plugins - Set to False to disable plugin callbacks.
+               @whitelist    - A list of whitelisted plugins.
+               @blacklist    - A list of blacklisted plugins.
+
+               Returns a dictionary of:
+                        
+                       {
+                               'file_name' : ([list, of, offsets], [list, of, entropy], average_entropy)
+                       }
+               '''
+               data = {}
+
+               self.entropy = Entropy(files,
+                                       self,
+                                       offset,
+                                       length,
+                                       block,
+                                       plot,
+                                       legend,
+                                       save,
+                                       algorithm=algorithm,
+                                       load_plugins=load_plugins,
+                                       whitelist=whitelist,
+                                       blacklist=blacklist)
+               
+               data = self.entropy.analyze()
+               
+               del self.entropy
+               self.entropy = None
+
+               return data
+
+       def scan(self, target_files, offset=0, length=0, show_invalid_results=False, callback=None, start_callback=None, end_callback=None, base_dir=None, matryoshka=1, plugins_whitelist=[], plugins_blacklist=[]):
+               '''
+               Performs a binwalk scan on a file or list of files.
+
+               @target_files         - File or list of files to scan.
+               @offset               - Starting offset at which to start the scan.
+                @length               - Number of bytes to scan. Specify -1 for streams.
+                @show_invalid_results - Set to True to display invalid results.
+                @callback             - Callback function to be invoked when matches are found.
+               @start_callback       - Callback function to be invoked prior to scanning each file.
+               @end_callback         - Callback function to be invoked after scanning each file.
+               @base_dir             - Base directory for output files.
+               @matryoshka           - Number of levels to traverse into the rabbit hole.
+               @plugins_whitelist    - A list of plugin names to load. If not empty, only these plugins will be loaded.
+               @plugins_blacklist    - A list of plugin names to not load.
+
+               Returns a dictionary of :
+
+                       {
+                               'target file name' : [
+                                                       (0, [{description : "LZMA compressed data..."}]),
+                                                       (112, [{description : "gzip compressed data..."}])
+                               ]
+                       }
+               '''
+               # Prefix all directory names with an underscore. This prevents accidental deletion of the original file(s)
+               # when the user is typing too fast and is trying to delete the extraction directory.
+               prefix = '_'
+               dir_extension = 'extracted'
+               i = 0
+               total_results = {}
+               self.matryoshka = matryoshka
+
+               # For backwards compatibility
+               if not isinstance(target_files, type([])):
+                       target_files = [target_files]
+
+               if base_dir is None:
+                       base_dir = ''
+
+               # Instantiate the Plugins class and load all plugins, if not disabled
+               self.plugins = Plugins(self, whitelist=plugins_whitelist, blacklist=plugins_blacklist)
+               if self.load_plugins:
+                       self.plugins._load_plugins()
+
+               while i < self.matryoshka:
+                       new_target_files = []
+
+                       # Scan each target file
+                       for target_file in target_files:
+                               ignore_files = []
+
+                               # On the first scan, add the base_dir value to dir_prefix. Subsequent target_file values will have this value prepended already.
+                               if i == 0:
+                                       dir_prefix = os.path.join(base_dir, prefix + os.path.basename(target_file))
+                               else:
+                                       dir_prefix = os.path.join(os.path.dirname(target_file), prefix + os.path.basename(target_file))
+
+                               output_dir = unique_file_name(dir_prefix, dir_extension)
+
+                               # Set the output directory for extracted files to go to
+                               self.extractor.output_directory(output_dir)
 
-               @target_file                    - File to scan.
-               @offset                         - Starting offset at which to start the scan.
-               @length                         - Number of bytes to scan.
-               @align                          - Look for signatures every align bytes.
-               @show_invalid_results           - Set to True to display invalid results.
-               @callback                       - Callback function to be invoked when matches are found.
+                               if start_callback is not None:
+                                       start_callback(target_file)
+       
+                               results = self.single_scan(target_file, 
+                                                       offset=offset, 
+                                                       length=length, 
+                                                       show_invalid_results=show_invalid_results,
+                                                       callback=callback)
+       
+                               if end_callback is not None:
+                                       end_callback(target_file)
+
+                               # Get a list of extracted file names; don't scan them again.
+                               for (index, results_list) in results:
+                                       for result in results_list:
+                                               if result['extract']:
+                                                       ignore_files.append(result['extract'])
+
+                               # Find all newly created files and add them to new_target_files / new_target_directories
+                               for (dir_path, sub_dirs, files) in os.walk(output_dir):
+                                       for fname in files:
+                                               fname = os.path.join(dir_path, fname)
+                                               if fname not in ignore_files:
+                                                       new_target_files.append(fname)
+
+                                       # Don't worry about sub-directories
+                                       break
+
+                               total_results[target_file] = results
+
+                       target_files = new_target_files
+                       i += 1
+
+               # Be sure to delete the Plugins instance so that there isn't a lingering reference to
+               # this Binwalk class instance (lingering handles to this Binwalk instance cause the
+               # __del__ destructor to not be called).
+               if self.plugins is not None:
+                       del self.plugins
+                       self.plugins = None
+
+               return total_results
+
+       def single_scan(self, target_file='', fd=None, offset=0, length=0, show_invalid_results=False, callback=None, plugins_whitelist=[], plugins_blacklist=[]):
+               '''
+               Performs a binwalk scan on one target file or file descriptor.
+
+               @target_file          - File to scan.
+               @fd                   - File descriptor to scan.
+               @offset               - Starting offset at which to start the scan.
+               @length               - Number of bytes to scan. Specify -1 for streams.
+               @show_invalid_results - Set to True to display invalid results.
+               @callback             - Callback function to be invoked when matches are found.
+               @plugins_whitelist    - A list of plugin names to load. If not empty, only these plugins will be loaded.
+               @plugins_blacklist    - A list of plugin names to not load.
 
                The callback function is passed two arguments: a list of result dictionaries containing the scan results
                (one result per dict), and the offset at which those results were identified. Example callback function:
@@ -163,7 +393,7 @@ class Binwalk:
                Upon completion, the scan method returns a sorted list of tuples containing a list of results dictionaries
                and the offsets at which those results were identified:
 
-                       scan_items = [
+                       scan_results = [
                                        (0, [{description : "LZMA compressed data..."}]),
                                        (112, [{description : "gzip compressed data..."}])
                        ]
@@ -171,35 +401,58 @@ class Binwalk:
                See SmartSignature.parse for a more detailed description of the results dictionary structure.
                '''
                scan_results = {}
+               fsize = 0
+               jump_offset = 0
+               i_opened_fd = False
+               i_loaded_plugins = False
+               plugret = PLUGIN_CONTINUE
+               plugret_start = PLUGIN_CONTINUE
                self.total_read = 0
                self.total_scanned = 0
                self.scan_length = length
                self.filter.show_invalid_results = show_invalid_results
 
+               # Check to make sure either a target file or a file descriptor was supplied
+               if not target_file and fd is None:
+                       raise Exception("Must supply Binwalk.single_scan with a valid file path or file object")
+
                # Load the default signatures if self.load_signatures has not already been invoked
                if self.magic is None:
                        self.load_signatures()
 
-               # Get a local copy of the signature sets generated by self.parser.build_signature_set.
-               # This is accessed heavily throughout the scan, and there is less overhead for accessing local variables in Python.
-               signature_set = self.parser.build_signature_set()
-
                # Need the total size of the target file, even if we aren't scanning the whole thing
-               fsize = file_size(target_file)
-
+               if target_file:
+                       fsize = file_size(target_file)
+                       
                # Open the target file and seek to the specified start offset
-               fd = open(target_file)
-               fd.seek(offset)
+               if fd is None:
+                       fd = open(target_file)
+                       i_opened_fd = True
+       
+               # Seek to the starting offset. This is invalid for some file-like objects such as stdin,
+               # so if an exception is thrown try reading offset bytes from the file object.   
+               try:    
+                       fd.seek(offset)
+               except:
+                       fd.read(offset)
                
                # If no length was specified, make the length the size of the target file minus the starting offset
                if self.scan_length == 0:
                        self.scan_length = fsize - offset
-               # Sanity check on the byte alignment; default to 1
-               if align <= 0:
-                       align = 1
 
+               # If the Plugins class has not already been instantiated, do that now.
+               if self.plugins is None:
+                       self.plugins = Plugins(self, blacklist=plugins_blacklist, whitelist=plugins_whitelist)
+                       i_loaded_plugins = True
+               
+                       if self.load_plugins:
+                               self.plugins._load_plugins()
+
+               # Invoke any pre-scan plugins
+               plugret_start = self.plugins._pre_scan_callbacks(fd)
+               
                # Main loop, scan through all the data
-               while True:
+               while not ((plugret | plugret_start) & PLUGIN_TERMINATE):
                        i = 0
 
                        # Read in the next block of data from the target file and make sure it's valid
@@ -208,51 +461,49 @@ class Binwalk:
                                break
 
                        # The total number of bytes scanned could be bigger than the total number
-                       # of bytes read from the file under the following circumstances:
-                       #
-                       #       o The previous dlen was not a multiple of align
-                       #       o A previous result specified a jump offset that was beyond the end of the
-                       #         then current data block
+                       # of bytes read from the file if the previous signature result specified a 
+                       # jump offset that was beyond the end of the then current data block.
                        #
                        # If this is the case, we need to index into this data block appropriately in order to 
                        # resume the scan from the appropriate offset, and adjust dlen accordingly.
-                       bufindex = self.total_scanned - self.total_read
-                       if bufindex > 0:
-                               # If the total_scanned > total_read, then the total_scanned offset is in a subsequent block.
-                               # Set i to bufindex, which will cause i to be greater than dlen and this block will be skipped.
-                               i = bufindex
-                       elif bufindex < 0:
-                               # If the total_scanned offset is less than total_read, then the total_scanned offset is
-                               # somewhere inside this block. Set i to index into the block appropriately.
-                               i = dlen + bufindex
-                       else:
-                               # If the total_scanned offset ends at the end of this block, don't scan any of this block
-                               i = dlen
+                       if jump_offset > 0:
+                               total_check = self.total_scanned + dlen
+
+                               if jump_offset >= total_check:
+                                       i = -1
+                                       
+                                       # Try to seek to the jump offset; this won't work if fd == sys.stdin
+                                       try:
+                                               fd.seek(jump_offset)
+                                               self.total_read = jump_offset
+                                               self.total_scanned = jump_offset - dlen
+                                               jump_offset = 0
+                                       except:
+                                               pass
+                               elif jump_offset < total_check:
+                                       # Index into this block appropriately
+                                       i = jump_offset - self.total_scanned
+                                       jump_offset = 0
 
                        # Scan through each block of data looking for signatures
-                       while i < dlen:
-                               smart = {}
-                               results = []
-                               results_offset = -1
-                               pre_filter_ok = False
-                               smart_jump_done = False
-
-                               # Pre-filter data by checking to see if the parser thinks this might be a valid match.
-                               # This eliminates unnecessary calls into libmagic, which are very expensive.
-                               #
-                               # Ideally, this should be done in the MagicParser class, but function calls are expensive.
-                               # Doing it here greatly decreases the scan time.
-                               if self.smart.pre_filter:
-                                       for (sig_offset, sigset) in signature_set:
-                                               if data[i+sig_offset:i+sig_offset+self.parser.MATCH_INDEX_SIZE] in sigset:
-                                                       pre_filter_ok = True
-                                                       break
-                               else:
-                                       pre_filter_ok = True
+                       if i >= 0 and i < dlen:
+
+                               # Scan this data block for a list of offsets which are candidates for possible valid signatures
+                               for candidate in self.parser.find_signature_candidates(data[i:dlen]):
+
+                                       # If a signature specified a jump offset beyond this candidate signature offset, ignore it
+                                       if (i + candidate + self.total_scanned) < jump_offset:
+                                               continue
+
+                                       # Reset these values on each loop       
+                                       smart = {}
+                                       results = []
+                                       results_offset = -1
 
-                               if pre_filter_ok:
                                        # Pass the data to libmagic, and split out multiple results into a list
-                                       for magic_result in self.parser.split(self.magic.buffer(data[i:i+self.MAX_SIGNATURE_SIZE])):
+                                       for magic_result in self.parser.split(self.magic.buffer(data[i+candidate:i+candidate+self.MAX_SIGNATURE_SIZE])):
+
+                                               i_set_results_offset = False
 
                                                # Some file names are not NULL byte terminated, but rather their length is
                                                # specified in a size field. To ensure these are not marked as invalid due to
@@ -271,48 +522,52 @@ class Binwalk:
                                                                # the calculated results offset will be wrong since i will have been incremented. Only set the
                                                                # results_offset value when the first match is encountered.
                                                                if results_offset < 0:
-                                                                       results_offset = offset + smart['adjust'] + self.total_scanned
+                                                                       results_offset = offset + i + candidate + smart['adjust'] + self.total_scanned
+                                                                       i_set_results_offset = True
 
                                                                # Double check to make sure the smart['adjust'] value is sane. 
                                                                # If it makes results_offset negative, then it is not sane.
                                                                if results_offset >= 0:
+                                                                       smart['offset'] = results_offset
+
+                                                                       # Invoke any scan plugins 
+                                                                       if not (plugret_start & PLUGIN_STOP_PLUGINS):
+                                                                               plugret = self.plugins._scan_callbacks(smart)
+                                                                               results_offset = smart['offset']
+                                                                               if (plugret & PLUGIN_TERMINATE):
+                                                                                       break
+
                                                                        # Extract the result, if it matches one of the extract rules and is not a delayed extract.
-                                                                       if self.extractor.enabled and not (self.extractor.delayed and smart['delay']):
+                                                                       if self.extractor.enabled and not (self.extractor.delayed and smart['delay']) and not ((plugret | plugret_start) & PLUGIN_NO_EXTRACT):
                                                                                # If the signature did not specify a size, extract to the end of the file.
-                                                                               if smart['size'] == 0:
+                                                                               if not smart['size']:
                                                                                        smart['size'] = fsize-results_offset
-
+                                                                               
                                                                                smart['extract'] = self.extractor.extract(      results_offset, 
                                                                                                                                smart['description'], 
                                                                                                                                target_file, 
                                                                                                                                smart['size'], 
                                                                                                                                name=smart['name'])
 
-                                                                       # This appears to be a valid result, so append it to the results list.
-                                                                       results.append(smart)
-
-                                                       # Jump to the offset specified by jump. Only do this once, so that if multiple results
-                                                       # are returned each of which specify a jump offset, only the first will be honored.
-                                                       if smart['jump'] > 0 and not smart_jump_done:
-                                                               # Once a jump offset has been honored, we need to start scanning every byte since the
-                                                               # jump offset may have thrown off the original alignment. In terms of speed this is fine,
-                                                               # since the jump offset usually saves more time anyway. If this is not what the user
-                                                               # wanted/intended, disabling pre filtering will disable jump offset processing completely.
-                                                               align = self.DEFAULT_BYTE_ALIGNMENT
-                                                               smart_jump_done = True
-                                                               i += (smart['jump'] - align)
-                                                               self.total_scanned += (smart['jump'] - align)
-
-                               # Did we find any valid results?
-                               if results_offset >= 0:
-                                       scan_results[results_offset] = results
+                                                                       if not ((plugret | plugret_start) & PLUGIN_NO_DISPLAY):
+                                                                               # This appears to be a valid result, so append it to the results list.
+                                                                               results.append(smart)
+                                                                       elif i_set_results_offset:
+                                                                               results_offset = -1
+
+                                       # Did we find any valid results?
+                                       if results_offset >= 0:
+                                               scan_results[results_offset] = results
                                        
-                                       if callback is not None:
-                                               callback(results_offset, results)
+                                               if callback is not None:
+                                                       callback(results_offset, results)
+                       
+                                               # If a relative jump offset was specified, update the absolute jump_offset variable
+                                               if smart.has_key('jump') and smart['jump'] > 0:
+                                                       jump_offset = results_offset + smart['jump']
 
-                               # Track the number of bytes scanned in this block, and the total number of bytes scanned.       
-                               i += align
-                               self.total_scanned += align
+                       # Track the total number of bytes scanned
+                       self.total_scanned += dlen
 
                # Sort the results before returning them
                scan_items = scan_results.items()
@@ -322,8 +577,50 @@ class Binwalk:
                if self.extractor.enabled and self.extractor.delayed:
                        scan_items = self.extractor.delayed_extract(scan_items, target_file, fsize)
 
+               # Invoke any post-scan plugins
+               #if not (plugret_start & PLUGIN_STOP_PLUGINS):
+               self.plugins._post_scan_callbacks(fd)
+
+               # Be sure to delete the Plugins instance so that there isn't a lingering reference to
+               # this Binwalk class instance (lingering handles to this Binwalk instance cause the
+               # __del__ destructor to not be called).
+               if i_loaded_plugins:
+                       del self.plugins
+                       self.plugins = None
+
+               if i_opened_fd:
+                       fd.close()
+
                return scan_items
 
+       def concatenate_results(self, results, new):
+               '''
+               Concatenate multiple Binwalk.scan results into one dictionary.
+
+               For a file name already present in results, each new (offset, results list) entry is
+               merged into the existing entry with the same offset, or appended if no such entry exists.
+
+               @results - Binwalk results to append new results to; modified in place.
+               @new     - New data to append to results.
+
+               Returns None.
+               '''
+               for (new_file_name, new_data) in new.items():
+                       if new_file_name not in results:
+                               results[new_file_name] = new_data
+                               continue
+
+                       existing = results[new_file_name]
+
+                       for new_entry in new_data:
+                               found_offset = False
+                               (new_offset, new_results_list) = new_entry
+
+                               for j in range(0, len(existing)):
+                                       (offset, results_list) = existing[j]
+                                       if offset == new_offset:
+                                               results_list += new_results_list
+                                               existing[j] = (offset, results_list)
+                                               found_offset = True
+                                               break
+
+                               if not found_offset:
+                                       # Append only this unmatched entry. Appending all of new_data here
+                                       # would duplicate every entry each time an offset was not found.
+                                       existing.append(new_entry)
+
        def _should_display(self, data):
                '''
                Determines if a result string should be displayed to the user or not.
@@ -350,16 +647,18 @@ class Binwalk:
                # end of the returned data in case a signature is found at or near data[dlen].
                rlen = self.READ_BLOCK_SIZE + self.MAX_SIGNATURE_SIZE
 
-               if self.total_read < self.scan_length:
+               # Check to make sure we only read up to scan_length bytes (streams have a scan length of -1)
+               if self.scan_length == -1 or self.total_read < self.scan_length:
                        
-                       data = fd.read(rlen)
+                       # Read in the next rlen bytes, plus any extra data from the previous read (only needed for streams)
+                       data = self.last_extra_data_section + fd.read(rlen - len(self.last_extra_data_section))
                        
                        if data and data is not None:
                                # Get the actual length of the read in data
                                dlen = len(data)
 
                                # If we've read in more data than the scan length, truncate the dlen value
-                               if (self.total_read + dlen) >= self.scan_length:
+                               if self.scan_length != -1 and (self.total_read + dlen) >= self.scan_length:
                                        dlen = self.scan_length - self.total_read
                                # If dlen is the expected rlen size, it should be set to READ_BLOCK_SIZE
                                elif dlen == rlen:
@@ -369,8 +668,14 @@ class Binwalk:
                                # for processing (actual read size is larger of course, due to the MAX_SIGNATURE_SIZE
                                # buffer of data at the end of each block).
                                self.total_read += dlen
-                               # Seek to the self.total_read offset so the next read can pick up where this one left off
-                               fd.seek(self.total_read)
+
+                               # Seek to the self.total_read offset so the next read can pick up where this one left off.
+                               # If fd is a stream, this seek will fail; keep a copy of the extra buffer data so that it
+                               # can be added to the data buffer the next time this method is invoked.
+                               try:
+                                       fd.seek(self.total_read)
+                               except:
+                                       self.last_extra_data_section = data[dlen:]
 
                return (data, dlen)
 
index c4ca2d0..5ad52f6 100644 (file)
@@ -66,3 +66,26 @@ def get_quoted_strings(string):
                return re.findall(r'\"(.*)\"', string)[0]
        except:
                return ''
+
+def unique_file_name(base_name, extension=''):
+       '''
+       Builds a file name that does not currently exist on disk.
+
+       The first candidate is base_name plus the extension; while the candidate exists,
+       "<base_name>-<N><extension>" is tried instead, with N counting up from 0.
+
+       @base_name - The base name to use for the unique file name.
+       @extension - The file extension to use for the unique file name; a leading '.' is added if missing.
+
+       Returns a unique file string.
+       '''
+       suffix = extension
+       if suffix and not suffix.startswith('.'):
+               suffix = '.%s' % suffix
+
+       attempt = 0
+       candidate = base_name + suffix
+
+       while os.path.exists(candidate):
+               candidate = "%s-%d%s" % (base_name, attempt, suffix)
+               attempt += 1
+
+       return candidate
+
index cb1d72f..5c26254 100644 (file)
@@ -27,19 +27,22 @@ class Config:
        Valid file names under both the 'user' and 'system' keys are as follows:
 
                o BINWALK_MAGIC_FILE  - Path to the default binwalk magic file.
-               o BINCAST_MAGIC_FILE  - Path to the bincast magic file (used when -C is specified with the command line binwalk script)
-               o BINARCH_MAGIC_FILE  - Path to the binarch magic file (used when -A is specified with the command line binwalk script)
-               o EXTRACT_FILE        - Path to the extract configuration file (used when -e is specified with the command line binwalk script)
+               o BINCAST_MAGIC_FILE  - Path to the bincast magic file (used when -C is specified with the command line binwalk script).
+               o BINARCH_MAGIC_FILE  - Path to the binarch magic file (used when -A is specified with the command line binwalk script).
+               o EXTRACT_FILE        - Path to the extract configuration file (used when -e is specified with the command line binwalk script).
+               o PLUGINS             - Path to the plugins directory.
        '''
        # Release version
-       VERSION = "1.0"
+       VERSION = "1.2.1"
 
        # Sub directories
        BINWALK_USER_DIR = ".binwalk"
        BINWALK_MAGIC_DIR = "magic"
        BINWALK_CONFIG_DIR = "config"
+       BINWALK_PLUGINS_DIR = "plugins"
 
        # File names
+       PLUGINS = "plugins"
        EXTRACT_FILE = "extract.conf"
        BINWALK_MAGIC_FILE = "binwalk"
        BINCAST_MAGIC_FILE = "bincast"
@@ -61,16 +64,18 @@ class Config:
                }
 
                # Build the paths to all user-specific files
-               self.paths['user'][self.BINWALK_MAGIC_FILE] = self._user_file(self.BINWALK_MAGIC_DIR, self.BINWALK_MAGIC_FILE)
-               self.paths['user'][self.BINCAST_MAGIC_FILE] = self._user_file(self.BINWALK_MAGIC_DIR, self.BINCAST_MAGIC_FILE)
-               self.paths['user'][self.BINARCH_MAGIC_FILE] = self._user_file(self.BINWALK_MAGIC_DIR, self.BINARCH_MAGIC_FILE)
-               self.paths['user'][self.EXTRACT_FILE] = self._user_file(self.BINWALK_CONFIG_DIR, self.EXTRACT_FILE)
+               self.paths['user'][self.BINWALK_MAGIC_FILE] = self._user_path(self.BINWALK_MAGIC_DIR, self.BINWALK_MAGIC_FILE)
+               self.paths['user'][self.BINCAST_MAGIC_FILE] = self._user_path(self.BINWALK_MAGIC_DIR, self.BINCAST_MAGIC_FILE)
+               self.paths['user'][self.BINARCH_MAGIC_FILE] = self._user_path(self.BINWALK_MAGIC_DIR, self.BINARCH_MAGIC_FILE)
+               self.paths['user'][self.EXTRACT_FILE] = self._user_path(self.BINWALK_CONFIG_DIR, self.EXTRACT_FILE)
+               self.paths['user'][self.PLUGINS] = self._user_path(self.BINWALK_PLUGINS_DIR)
 
                # Build the paths to all system-wide files
-               self.paths['system'][self.BINWALK_MAGIC_FILE] = self._system_file(self.BINWALK_MAGIC_DIR, self.BINWALK_MAGIC_FILE)
-               self.paths['system'][self.BINCAST_MAGIC_FILE] = self._system_file(self.BINWALK_MAGIC_DIR, self.BINCAST_MAGIC_FILE)
-               self.paths['system'][self.BINARCH_MAGIC_FILE] = self._system_file(self.BINWALK_MAGIC_DIR, self.BINARCH_MAGIC_FILE)
-               self.paths['system'][self.EXTRACT_FILE] = self._system_file(self.BINWALK_CONFIG_DIR, self.EXTRACT_FILE)
+               self.paths['system'][self.BINWALK_MAGIC_FILE] = self._system_path(self.BINWALK_MAGIC_DIR, self.BINWALK_MAGIC_FILE)
+               self.paths['system'][self.BINCAST_MAGIC_FILE] = self._system_path(self.BINWALK_MAGIC_DIR, self.BINCAST_MAGIC_FILE)
+               self.paths['system'][self.BINARCH_MAGIC_FILE] = self._system_path(self.BINWALK_MAGIC_DIR, self.BINARCH_MAGIC_FILE)
+               self.paths['system'][self.EXTRACT_FILE] = self._system_path(self.BINWALK_CONFIG_DIR, self.EXTRACT_FILE)
+               self.paths['system'][self.PLUGINS] = self._system_path(self.BINWALK_PLUGINS_DIR)
        
        def _get_system_dir(self):
                '''
@@ -119,7 +124,7 @@ class Config:
 
                return fpath
 
-       def _user_file(self, subdir, basename):
+       def _user_path(self, subdir, basename=''):
                '''
                Gets the full path to the 'subdir/basename' file in the user binwalk directory.
 
@@ -130,7 +135,7 @@ class Config:
                '''
                return self._file_path(os.path.join(self.user_dir, self.BINWALK_USER_DIR, subdir), basename)
 
-       def _system_file(self, subdir, basename):
+       def _system_path(self, subdir, basename=''):
                '''
                Gets the full path to the 'subdir/basename' file in the system binwalk directory.
                
index 06888e7..3de0aab 100644 (file)
@@ -7,20 +7,23 @@
 #################################################################################################################
 
 # Assumes these utilities are installed in $PATH.
-gzip compressed data:gz:gzip -d -f '%e'
-lzma compressed data:7z:7zr e -y '%e'
-bzip2 compressed data:bz2:bzip2 -d -f '%e'
-zip archive data:zip:jar xf '%e' # jar does a better job of unzipping than unzip does...
-posix tar archive:tar:tar xvf '%e'
+^gzip compressed data:gz:gzip -d -f '%e'
+^lzma compressed data:7z:7zr e -y '%e'
+^bzip2 compressed data:bz2:bzip2 -d -f '%e'
+^zip archive data:zip:jar xf '%e' # jar does a better job of unzipping than unzip does...
+^posix tar archive:tar:tar xvf '%e'
+^rar archive data:rar:unrar e '%e'
+^arj archive data.*comment header:arj:arj e '%e'
 
 # These assume the firmware-mod-kit is installed to /opt/firmware-mod-kit.
 # If not, change the file paths appropriately.
-squashfs filesystem:squashfs:/opt/firmware-mod-kit/trunk/unsquashfs_all.sh '%e'
-jffs2 filesystem:jffs2:/opt/firmware-mod-kit/trunk/src/jffs2/unjffs2 '%e' # requires root
-cpio archive:cpio:/opt/firmware-mod-kit/trunk/uncpio.sh '%e'
+^squashfs filesystem:squashfs:/opt/firmware-mod-kit/trunk/unsquashfs_all.sh '%e'
+^jffs2 filesystem:jffs2:/opt/firmware-mod-kit/trunk/src/jffs2/unjffs2 '%e' # requires root
+^ascii cpio archive:cpio:/opt/firmware-mod-kit/trunk/uncpio.sh '%e'
+^cramfs filesystem:cramfs:/opt/firmware-mod-kit/trunk/uncramfs_all.sh '%e'
+^bff volume entry:bff:/opt/firmware-mod-kit/trunk/src/bff/bffxtractor.py '%e'
 
 # Extract, but don't run anything
-ext2 filesystem:ext2
-romfs filesystem:romfs
-cramfs filesystem:cramfs
-private key:key
+^ext2 filesystem:ext2
+^romfs filesystem:romfs
+^private key:key
diff --git a/binwalk/entropy.py b/binwalk/entropy.py
new file mode 100644 (file)
index 0000000..64b7337
--- /dev/null
@@ -0,0 +1,431 @@
+import zlib
+import math
+import os.path
+import plugins
+import common
+
+class FileEntropy(object):
+       '''
+       Class for analyzing and plotting data entropy for a file.
+       Preferred to use the Entropy class instead of calling FileEntropy directly.
+       '''
+
+       DEFAULT_BLOCK_SIZE = 1024
+       ENTROPY_TRIGGER = 0.9
+       ENTROPY_MAX = 0.95
+       FILE_FORMAT = 'svg'
+
+       def __init__(self, file_name=None, fd=None, binwalk=None, offset=0, length=None, block=DEFAULT_BLOCK_SIZE, plugins=None):
+               '''
+               Class constructor.
+
+               @file_name - The path to the file to analyze; opened in binary mode if fd is not given.
+               @fd        - A file object to analyze data from.
+               @binwalk   - An instance of the Binwalk class.
+               @offset    - The offset into the data to begin analysis.
+               @length    - The number of bytes to analyze.
+               @block     - The size of the data blocks to analyze.
+               @plugins   - Instance of the Plugins class.
+
+               Raises an Exception if neither file_name nor fd is provided.
+
+               Returns None.
+               '''
+               self.fd = fd
+               self.binwalk = binwalk
+               self.plugins = plugins
+               self.total_read = 0
+               self.fd_open = False
+
+               # Any false value (0, None, ...) falls back to the default for that setting
+               self.start = offset or 0
+               self.length = length or None
+               self.block = block or self.DEFAULT_BLOCK_SIZE
+
+               if self.fd is None:
+                       if file_name is None:
+                               raise Exception("Entropy.__init__ requires at least the file_name or fd options")
+
+                       # Remember that the file was opened here so that cleanup() closes it
+                       self.fd = open(file_name, 'rb')
+                       self.fd_open = True
+
+               # Some file descriptors aren't seekable (stdin, for example); for those,
+               # read and discard the leading bytes instead
+               try:
+                       self.fd.seek(self.start)
+               except:
+                       self.fd.read(self.start)
+
+               if self.binwalk:
+                       # Set the total_scanned and scan_length values for plugins and status display messages
+                       self.binwalk.total_scanned = 0
+
+                       if self.length:
+                               scan_length = self.length
+                       else:
+                               scan_length = common.file_size(self.fd.name) - self.start
+
+                       self.binwalk.scan_length = scan_length
+
+       def __enter__(self):
+               '''
+               Context manager entry point.
+
+               Returns this instance.
+               '''
+               return self
+
+       def __del__(self):
+               '''
+               Destructor; delegates to cleanup().
+               '''
+               self.cleanup()
+
+       def __exit__(self, t, v, traceback):
+               '''
+               Context manager exit point; delegates to cleanup().
+
+               The exception arguments (t, v, traceback) are not inspected. Returns None.
+               '''
+               self.cleanup()
+
+       def cleanup(self):
+               '''
+               Clean up any open file objects.
+               Called internally by __del__ and __exit__.
+
+               Only closes the file object if it was opened by this class (self.fd_open is True).
+               Any error raised while checking or closing is silently ignored.
+
+               Returns None.
+               '''
+               # The fd_open check is inside the try block as well, so a partially
+               # initialized instance does not raise here.
+               try:
+                       if self.fd_open:
+                               self.fd.close()
+               except:
+                       pass
+
+       def _read_block(self):
+               '''
+               Reads the next block of data from the file object.
+
+               At most self.block bytes are read, and never more than what is left of self.length
+               when a length is set. If a Binwalk instance is attached, its total_scanned value is
+               set to the number of bytes read before this block.
+
+               Returns a tuple of (data length, data, offset of the data), where the offset includes self.start.
+               '''
+               block_start = self.total_read
+
+               # Shrink the final read so that no more than self.length bytes are consumed in total
+               overshoots = self.length is not None and (block_start + self.block) > self.length
+               read_size = (self.length - block_start) if overshoots else self.block
+
+               chunk = self.fd.read(read_size)
+               chunk_size = len(chunk)
+
+               if self.binwalk:
+                       self.binwalk.total_scanned = block_start
+
+               self.total_read = block_start + chunk_size
+
+               return (chunk_size, chunk, block_start + self.start)
+
+       def gzip(self, offset, data, truncate=True):
+               '''
+               Performs an entropy analysis based on zlib compression ratio.
+               This is the default analysis used as it is faster than the shannon entropy analysis
+               and produces basically the same data.
+
+               @offset   - The offset of the data block (not used by this algorithm).
+               @data     - The data block to analyze.
+               @truncate - If True, ratios greater than 1.0 are capped at 1.0.
+
+               Returns the compression ratio of the block, or 0 for an empty block.
+               '''
+               # An empty block would cause a divide by zero below; report 0 for it, as shannon() does
+               if not data:
+                       return 0
+
+               # Entropy is a simple ratio of: <zlib compressed size> / <original size>
+               e = float(len(zlib.compress(data, 9))) / float(len(data))
+
+               # Small or incompressible blocks can grow when compressed, pushing the ratio above 1.0
+               if truncate and e > 1.0:
+                       e = 1.0
+
+               return e
+
+       def shannon(self, offset, data):
+               '''
+               Computes the Shannon entropy of a block of data.
+
+               @offset - The offset of the data block (not used by this algorithm).
+               @data   - The data block to analyze.
+
+               Returns the entropy in bits per byte divided by 8, or 0 for an empty block.
+               '''
+               if not data:
+                       return 0
+
+               size = len(data)
+               bits = 0
+
+               for value in range(256):
+                       occurrences = data.count(chr(value))
+
+                       # Byte values that never occur contribute nothing to the sum
+                       if occurrences:
+                               probability = float(occurrences) / size
+                               bits += - probability*math.log(probability, 2)
+
+               return (bits / 8)
+
+       def _do_analysis(self, algorithm):
+               '''
+               Reads the data block by block and computes the entropy of each block with the provided algorithm.
+
+               @algorithm - A function/method invoked as algorithm(offset, data) which returns an entropy value.
+
+               Returns a tuple of ([x-coordinates], [y-coordinates], average_entropy), where:
+
+                       o x-coordinates   = A list of offsets analyzed inside the data.
+                       o y-coordinates   = A corresponding list of entropy for each offset.
+                       o average_entropy = The mean of the recorded entropy values (0 if none were recorded).
+               '''
+               x_values = []
+               y_values = []
+               running_total = 0
+               mean = 0
+               self.total_read = 0
+               scan_flags = plugins.PLUGIN_CONTINUE
+               pre_flags = plugins.PLUGIN_CONTINUE
+
+               if self.plugins:
+                       pre_flags = self.plugins._pre_scan_callbacks(self.fd)
+
+               while True:
+                       # Stop as soon as the pre-scan or the latest scan callback asked to terminate
+                       if (pre_flags | scan_flags) & plugins.PLUGIN_TERMINATE:
+                               break
+
+                       (size, block_data, block_offset) = self._read_block()
+                       if not size or not block_data:
+                               break
+
+                       value = algorithm(block_offset, block_data)
+
+                       result = {'description' : '%f' % value, 'offset' : block_offset}
+
+                       if self.plugins:
+                               # Plugins may modify the result, so read the offset and entropy back from it
+                               scan_flags = self.plugins._scan_callbacks(result)
+                               block_offset = result['offset']
+                               value = float(result['description'])
+
+                       # Skip blocks that the plugins asked to terminate on or not to display
+                       if (pre_flags | scan_flags) & (plugins.PLUGIN_TERMINATE | plugins.PLUGIN_NO_DISPLAY):
+                               continue
+
+                       if self.binwalk:
+                               self.binwalk.display.results(block_offset, [result])
+
+                       y_values.append(value)
+                       x_values.append(block_offset)
+                       running_total += value
+
+               # No blocks are recorded if one/all plugins returned PLUGIN_TERMINATE or PLUGIN_NO_DISPLAY,
+               # or if the file being scanned is a zero-size file; the average is left at 0 in that case.
+               if x_values:
+                       mean = float(float(running_total) / float(len(x_values)))
+
+               if self.plugins:
+                       self.plugins._post_scan_callbacks(self.fd)
+
+               return (x_values, y_values, mean)
+
+       def analyze(self, algorithm=None):
+               '''
+               Performs an entropy analysis of the data using the specified algorithm.
+
+               @algorithm - A method inside of the Entropy class to invoke for entropy analysis.
+                            Default method: self.gzip.
+                            Other available methods: self.shannon.
+                            May also be a string: 'shannon' (case insensitive).
+
+               Returns the return value of self._do_analysis for the selected algorithm.
+               '''
+               chosen = self.gzip
+
+               if algorithm:
+                       if callable(algorithm):
+                               chosen = algorithm
+
+                       # Callables have no lower() method; any error here simply means that
+                       # the algorithm was not specified by name.
+                       try:
+                               wants_shannon = (algorithm.lower() == 'shannon')
+                       except:
+                               wants_shannon = False
+
+                       if wants_shannon:
+                               chosen = self.shannon
+
+               return self._do_analysis(chosen)
+       
+       def plot(self, x, y, average=0, file_results=[], show_legend=True, save=False):
+               '''
+               Plots entropy data.
+
+               If no file_results are given, and both show_legend and average are set, rising and falling
+               entropy edges are detected from the data and plotted as the markers instead.
+
+               @x            - List of graph x-coordinates (i.e., data offsets).
+               @y            - List of graph y-coordinates (i.e., entropy for each offset).
+               @average      - The average entropy.
+               @file_results - A list of tuples containing additional analysis data, as returned by Binwalk.single_scan.
+               @show_legend  - Set to False to not generate a color-coded legend and plotted x coordinates for the graph.
+               @save         - If set to True, graph will be saved to disk rather than displayed.
+
+               Returns None.
+               '''
+               import matplotlib.pyplot as plt
+               import numpy as np
+
+               i = 0
+               trigger = 0
+               new_ticks = []
+               colors = ['darkgreen', 'blueviolet', 'saddlebrown', 'deeppink', 'goldenrod', 'olive', 'black']
+               color_mappings = {}
+
+               # Clear the current figure before drawing
+               plt.clf()
+
+               if not file_results and show_legend and average:
+                       # Rebind to a new list so that the default argument value is never modified
+                       file_results = []
+
+                       # Typically the average entropy is used as the trigger level for rising/falling entropy edges.
+                       # If the average entropy is too low, false rising and falling edges will be marked; if this is
+                       # the case, and if there is at least one data point greater than ENTROPY_MAX, use ENTROPY_TRIGGER
+                       # as the trigger level to avoid false edges.
+                       if average < self.ENTROPY_TRIGGER:
+                               for point in y:
+                                       if point > self.ENTROPY_MAX:
+                                               trigger = self.ENTROPY_TRIGGER
+                                               break
+
+                       if not trigger:
+                               trigger = average
+
+                       # An edge is recorded wherever two consecutive data points cross the trigger level
+                       for j in range(0, len(x)):
+                               if j > 0:
+                                       if y[j] >= trigger and y[j-1] < trigger:
+                                               file_results.append((x[j], [{'description' : 'Entropy rising edge'}]))
+                                       elif y[j] <= trigger and y[j-1] > trigger:
+                                               file_results.append((x[j], [{'description' : 'Entropy falling edge'}]))
+
+               if file_results:
+                       # Draw one vertical line per result; results are grouped by the text before the first
+                       # comma of their first description, and each group gets its own color.
+                       for (offset, results) in file_results:
+                               label = None
+                               description = results[0]['description'].split(',')[0]
+
+                               if not color_mappings.has_key(description):
+                                       # Only the first line of each group is labeled, giving one legend entry per group
+                                       if show_legend:
+                                               label = description
+
+                                       color_mappings[description] = colors[i]
+                                       i += 1
+                                       # Start reusing colors from the beginning once the list is exhausted
+                                       if i >= len(colors):
+                                               i = 0
+                       
+                               plt.axvline(x=offset, label=label, color=color_mappings[description], linewidth=1.5)
+                               new_ticks.append(offset)
+
+                       if show_legend:
+                               plt.legend()
+
+                               # Replace the x-axis ticks with the offsets of the plotted markers
+                               if new_ticks:
+                                       new_ticks.sort()
+                                       plt.xticks(np.array(new_ticks), new_ticks)
+
+               plt.plot(x, y, linewidth=1.5)
+
+               # Draw the average entropy as a dashed red horizontal line
+               if average:
+                       plt.plot(x, [average] * len(x), linestyle='--', color='r')
+
+               plt.xlabel('Offset')
+               plt.ylabel('Entropy')
+               plt.title(self.fd.name)
+               plt.ylim(0, 1.5)
+               if save:
+                       # Saved next to the analyzed file, as '_<file name>.<FILE_FORMAT>', without overwriting existing files
+                       plt.savefig(common.unique_file_name(os.path.join(os.path.dirname(self.fd.name), '_' + os.path.basename(self.fd.name)), self.FILE_FORMAT))
+               else:
+                       plt.show()
+
+class Entropy(object):
+       '''
+       Class for analyzing and plotting data entropy for multiple files.
+
+       A simple example of performing a binwalk scan and overlaying the binwalk scan results on the
+       resulting entropy analysis graph:
+
+               import sys
+               import binwalk
+
+               bwalk = binwalk.Binwalk()
+               scan_results = bwalk.scan(sys.argv[1])
+
+                with binwalk.entropy.Entropy(scan_results, bwalk) as e:
+                        e.analyze()
+
+               bwalk.cleanup()
+       '''
+
+       DESCRIPTION = "ENTROPY"
+       ENTROPY_SCAN = 'entropy'
+
+       def __init__(self, files, binwalk=None, offset=0, length=0, block=0, plot=True, legend=True, save=False, algorithm=None, load_plugins=True, whitelist=[], blacklist=[]):
+               '''
+               Class constructor.
+
+               @files        - A dictionary containing file names and results data, as returned by Binwalk.scan.
+               @binwalk      - An instance of the Binwalk class.
+               @offset       - The offset into the data to begin analysis.
+               @length       - The number of bytes to analyze.
+               @block        - The size of the data blocks to analyze.
+               @plot         - Set to False to disable plotting.
+               @legend       - Set to False to exclude the legend and custom offset markers from the plot.
+               @save         - Set to True to save plots to disk instead of displaying them.
+               @algorithm    - Set to 'shannon' to use shannon entropy algorithm.
+               @load_plugins - Set to False to disable plugin callbacks.
+               @whitelist    - A list of whitelisted plugins.
+               @blacklist    - A list of blacklisted plugins.
+
+               Returns None.
+               '''
+               self.files = files
+               self.binwalk = binwalk
+               self.offset = offset
+               self.length = length
+               self.block = block
+               self.plot = plot
+               self.legend = legend
+               self.save = save
+               self.algorithm = algorithm
+               self.plugins = None
+               self.load_plugins = load_plugins
+               self.whitelist = whitelist
+               self.blacklist = blacklist
+
+               if len(self.files) > 1:
+                       self.save = True
+
+               if self.binwalk:
+                       self.binwalk.scan_type = self.binwalk.ENTROPY
+
+       def __enter__(self):
+               return self
+
+       def __exit__(self, t, v, traceback):
+               return None
+
+       def __del__(self):
+               return None
+
+       def set_entropy_algorithm(self, algorithm):
+               '''
+               Specify a function/method to call for determining data entropy.
+
+               @algorithm - The function/method to call. This will be  passed two arguments:
+                            the file offset of the data block, and a data block (type 'str').
+                            It must return a single floating point entropy value from 0.0 and 1.0, inclusive.
+
+               Returns None.
+               '''
+               self.algorithm = algorithm
+
+       def analyze(self):
+               '''
+               Perform an entropy analysis on the target files.
+
+               Returns a dictionary of:
+                       
+                       {
+                               'file_name' : ([list, of, offsets], [list, of, entropy], average_entropy)
+                       }
+               '''
+               results = {}
+
+               if self.binwalk and self.load_plugins:
+                       self.plugins = plugins.Plugins(self.binwalk, whitelist=self.whitelist, blacklist=self.blacklist)
+
+               for (file_name, overlay) in self.files.iteritems():
+
+                       if self.plugins:
+                               self.plugins._load_plugins()
+
+                       if self.binwalk:
+                               self.binwalk.display.header(file_name=file_name, description=self.DESCRIPTION)
+
+                       with FileEntropy(file_name=file_name, binwalk=self.binwalk, offset=self.offset, length=self.length, block=self.block, plugins=self.plugins) as e:
+                               (x, y, average) = e.analyze(self.algorithm)
+                               
+                               if self.plot or self.save:
+                                       e.plot(x, y, average, overlay, self.legend, self.save)
+                               
+                               results[file_name] = (x, y, average)
+
+                       if self.binwalk:
+                               self.binwalk.display.footer()
+
+               if self.plugins:
+                       del self.plugins
+                       self.plugins = None
+
+               return results
+
index fdcd511..f0573c3 100644 (file)
@@ -1,10 +1,11 @@
 import os
+import re
 import sys
 import shlex
 import tempfile
 import subprocess
 from config import *
-from common import file_size
+from common import file_size, unique_file_name
 
 class Extractor:
        '''
@@ -20,7 +21,7 @@ class Extractor:
                # Create extraction rules for scan results containing the string 'gzip compressed data' and 'filesystem'.
                # The former will be saved to disk with a file extension of 'gz' and the command 'gunzip <file name on disk>' will be executed (note the %e placeholder).
                # The latter will be saved to disk with a file extension of 'fs' and no command will be executed.
-               # These rules will take precedence over subsequent rules with the same match string.
+               # These rules will be ignored if there were previous rules with the same match string.
                bw.extractor.add_rule(['gzip compressed data:gz:gunzip %e', 'filesystem:fs'])
 
                # Load the extraction rules from the default extract.conf file(s).
@@ -40,6 +41,9 @@ class Extractor:
        # Place holder for the extracted file name in the command 
        FILE_NAME_PLACEHOLDER = '%e'
 
+       # Max size of data to read/write at one time when extracting data
+       MAX_READ_SIZE = 10 * 1024 * 1024
+
        def __init__(self, verbose=False):
                '''
                Class constructor.
@@ -54,46 +58,98 @@ class Extractor:
                self.verbose = verbose
                self.extract_rules = {}
                self.remove_after_execute = False
+               self.extract_path = os.getcwd()
 
-       def add_rule(self, rule):
+       def add_rule(self, txtrule=None, regex=None, extension=None, cmd=None):
                '''
                Adds a set of rules to the extraction rule list.
 
-               @rule - Rule string, or list of rule strings, in the format <case insensitive matching string>:<file extension>[:<command to run>]
+               @txtrule   - Rule string, or list of rule strings, in the format <regular expression>:<file extension>[:<command to run>]
+               @regex     - If rule string is not specified, this is the regular expression string to use.
+               @extension - If rule string is not specified, this is the file extension to use.
+               @cmd       - If rule string is not specified, this is the command to run.
+                            Alternatively a callable object may be specified, which will be passed one argument: the path to the file to extract.
 
                Returns None.
                '''
+               rules = []
+               match = False
                r = {
                        'extension'     : '',
-                       'cmd'           : ''
+                       'cmd'           : '',
+                       'regex'         : None
                }
 
-               if type(rule) != type([]):
-                       rules = [rule]
-               else:
-                       rules = rule
+               if not txtrule and regex and extension:
+                       txtrule = '%s:%s' % (regex, extension)
+                       if cmd:
+                               txtrule += ':%s' % cmd
 
+               if not isinstance(txtrule, type([])):
+                       rules = [txtrule]
+               else:
+                       rules = txtrule
+               
                for rule in rules:
                        r['cmd'] = ''
                        r['extension'] = ''
 
                        try:
                                values = self._parse_rule(rule)
-                               match = values[0].lower()
+                               match = values[0]
+                               r['regex'] = re.compile(values[0])
                                r['extension'] = values[1]
                                r['cmd'] = values[2]
                        except:
                                pass
 
+                       if not match and regex and extension:
+                               match = regex
+                               r['regex'] = re.compile(regex)
+                               r['extension'] = extension
+                               r['cmd'] = cmd
+
                        # Verify that the match string and file extension were retrieved.
                        # Only add the rule if it is a new one (first come, first served).
                        if match and r['extension'] and not self.extract_rules.has_key(match):
                                self.extract_rules[match] = {}
                                self.extract_rules[match]['cmd'] = r['cmd']
                                self.extract_rules[match]['extension'] = r['extension']
+                               self.extract_rules[match]['regex'] = r['regex']
                                # Once any rule is added, set self.enabled to True
                                self.enabled = True
 
+       def remove_rule(self, text):
+               '''
+               Remove all rules that match a specified text.
+
+               @text - The text to match against.
+
+               Returns the number of rules removed.
+               '''
+               i = 0
+
+               for key in self.extract_rules.keys():
+                       if self.extract_rules[key]['regex'].match(text):
+                               del self.extract_rules[key]
+                               i += 1
+               return i
+
+       def clear_rules(self):
+               '''
+               Deletes all extraction rules.
+
+               Returns None.
+               '''
+               self.extract_rules = {}
+               self.enabled = False
+
+       def get_rules(self):
+               '''
+               Returns a dictionary of all extraction rules.
+               '''
+               return self.extract_rules
+
        def enable_delayed_extract(self, tf=None):
                '''
                Enables / disables the delayed extraction feature.
@@ -142,6 +198,16 @@ class Extractor:
                                if self.verbose:
                                        raise Exception("Extractor.load_defaults failed to load file '%s': %s" % (extract_file, str(e)))
 
+       def output_directory(self, path):
+               '''
+               Set the output directory for extracted files.
+
+               @path - The extraction path.
+
+               Returns None.
+               '''
+               self.extract_path = path
+
        def cleanup_extracted_files(self, tf=None):
                '''
                Set the action to take after a file is extracted.
@@ -170,45 +236,57 @@ class Extractor:
 
                Returns the name of the extracted file (blank string if nothing was extracted).
                '''
+               fname = ''
                cleanup_extracted_fname = True
+               original_dir = os.getcwd()
 
-               rule = self._match(description)
-               if rule is not None:
-                       fname = self._dd(file_name, offset, size, rule['extension'], output_file_name=name)
-                       if rule['cmd']:
-
-                               # Many extraction utilities will extract the file to a new file, just without
-                               # the file extension (i.e., myfile.7z => myfile). If the presumed resulting
-                               # file name already exists before executing the extract command, do not attempt 
-                               # to clean it up even if its resulting file size is 0.
-                               if self.remove_after_execute:
-                                       extracted_fname = os.path.splitext(fname)[0]
-                                       if os.path.exists(extracted_fname):
-                                               cleanup_extracted_fname = False
-
-                               # Execute the specified command against the extracted file
-                               self._execute(rule['cmd'], fname)
+               if not os.path.exists(self.extract_path):
+                       os.mkdir(self.extract_path)
 
-                               # Only clean up files if remove_after_execute was specified                             
-                               if self.remove_after_execute:
-
-                                       # Remove the original file that we extracted
-                                       try:
-                                               os.unlink(fname)
-                                       except:
-                                               pass
+               file_path = os.path.realpath(file_name)
+               
+               if os.path.isfile(file_path):
+                       os.chdir(self.extract_path)
+
+                       rule = self._match(description)
+                       if rule is not None:
+                               fname = self._dd(file_path, offset, size, rule['extension'], output_file_name=name)
+                               if rule['cmd']:
+
+                                       # Many extraction utilities will extract the file to a new file, just without
+                                       # the file extension (i.e., myfile.7z -> myfile). If the presumed resulting
+                                       # file name already exists before executing the extract command, do not attempt 
+                                       # to clean it up even if its resulting file size is 0.
+                                       if self.remove_after_execute:
+                                               extracted_fname = os.path.splitext(fname)[0]
+                                               if os.path.exists(extracted_fname):
+                                                       cleanup_extracted_fname = False
+       
+                                       # Execute the specified command against the extracted file
+                                       self._execute(rule['cmd'], fname)
 
-                                       # If the command worked, assume it removed the file extension from the extracted file
+                                       # Only clean up files if remove_after_execute was specified                             
+                                       if self.remove_after_execute:
 
-                                       # If the extracted file name file exists and is empty, remove it
-                                       if cleanup_extracted_fname and os.path.exists(extracted_fname) and file_size(extracted_fname) == 0:
+                                               # Remove the original file that we extracted
                                                try:
-                                                       os.unlink(extracted_fname)
+                                                       os.unlink(fname)
                                                except:
                                                        pass
-               else:
-                       fname = ''
 
+                                               # If the command worked, assume it removed the file extension from the extracted file
+
+                                               # If the extracted file name file exists and is empty, remove it
+                                               if cleanup_extracted_fname and os.path.exists(extracted_fname) and file_size(extracted_fname) == 0:
+                                                       try:
+                                                               os.unlink(extracted_fname)
+                                                       except:
+                                                               pass
+       
+                               fname = os.path.join(self.extract_path, fname)
+
+                       os.chdir(original_dir)
+       
                return fname
 
        def delayed_extract(self, results, file_name, size):
@@ -280,7 +358,7 @@ class Extractor:
                description = description.lower()
 
                for (m, rule) in self.extract_rules.iteritems():
-                       if m in description:
+                       if rule['regex'].search(description):
                                return rule
                return None
 
@@ -306,23 +384,23 @@ class Extractor:
 
                Returns the extracted file name.
                '''
-               # Default extracted file name is <hex offset>.<extension>
-               altname = "%X.%s" % (offset, extension)
-               
+               total_size = 0
+
                if not output_file_name or output_file_name is None:
-                       fname = altname
+                       # Default extracted file name is <hex offset>.<extension>
+                       bname = "%X" % offset
                else:
-                       fname = "%s.%s" % (output_file_name, extension)
-       
-               # Sanitize output file name of invalid/dangerous characters (like file paths)   
-               fname = os.path.basename(fname)
+                       # Strip the output file name of invalid/dangerous characters (like file paths)  
+                       bname = os.path.basename(output_file_name)
+               
+               fname = unique_file_name(bname, extension)
 
                try:
                        # Open the target file and seek to the offset
                        fdin = open(file_name, "rb")
                        fdin.seek(offset)
                        
-                       # Open the extracted file
+                       # Open the output file
                        try:
                                fdout = open(fname, "wb")
                        except:
@@ -330,8 +408,14 @@ class Extractor:
                                fname = altname
                                fdout = open(fname, "wb")
 
-                       # Read size bytes from target file and write it to the extracted file
-                       fdout.write(fdin.read(size))
+                       # Read data from target file in chunks and write it to the extracted file
+                       while total_size < size:
+                               block_size = size - total_size
+                               if block_size > self.MAX_READ_SIZE:
+                                       block_size = self.MAX_READ_SIZE
+                       
+                               fdout.write(fdin.read(block_size))
+                               total_size += block_size
 
                        # Cleanup
                        fdout.close()
@@ -352,16 +436,19 @@ class Extractor:
                '''
                tmp = None
 
-               # If not in verbose mode, create a temporary file to redirect stdout and stderr to
-               if not self.verbose:
-                       tmp = tempfile.TemporaryFile()
-
                try:
-                       # Replace all instances of FILE_NAME_PLACEHOLDER in the command with fname
-                       cmd = cmd.replace(self.FILE_NAME_PLACEHOLDER, fname)
-
-                       # Execute.
-                       subprocess.call(shlex.split(cmd), stdout=tmp, stderr=tmp)
+                       if callable(cmd):
+                               cmd(fname)
+                       else:
+                               # If not in verbose mode, create a temporary file to redirect stdout and stderr to
+                               if not self.verbose:
+                                       tmp = tempfile.TemporaryFile()
+
+                               # Replace all instances of FILE_NAME_PLACEHOLDER in the command with fname
+                               cmd = cmd.replace(self.FILE_NAME_PLACEHOLDER, fname)
+       
+                               # Execute.
+                               subprocess.call(shlex.split(cmd), stdout=tmp, stderr=tmp)
                except Exception, e:
                        sys.stderr.write("WARNING: Extractor.execute failed to run '%s': %s\n" % (cmd, str(e)))
                
index 5192cd9..a65e535 100644 (file)
@@ -1,12 +1,14 @@
+import re
 import common
-from smartsig import SmartSignature
+from smartsignature import SmartSignature
 
 class MagicFilter:
        '''
        Class to filter libmagic results based on include/exclude rules and false positive detection.
        An instance of this class is available via the Binwalk.filter object.
+       Note that all filter strings should be in lower case.
 
-       Example code which creates include, exclude, and grep filters before running a Binwalk scan:
+       Example code which creates include, exclude, and grep filters before running a binwalk scan:
 
                import binwalk
 
@@ -58,7 +60,7 @@ class MagicFilter:
                Adds a new filter which explicitly includes results that contain
                the specified matching text.
 
-               @match     - Case insensitive text, or list of texts, to match.
+               @match     - Regex, or list of regexs, to match.
                @exclusive - If True, then results that do not explicitly contain
                             a FILTER_INCLUDE match will be excluded. If False,
                             signatures that contain the FILTER_INCLUDE match will
@@ -67,22 +69,21 @@ class MagicFilter:
                
                Returns None.
                '''
-               include_filter = {
-                               'type'          : self.FILTER_INCLUDE,
-                               'filter'        : ''
-               }
-
-               if type(match) != type([]):
+               if not isinstance(match, type([])):
                        matches = [match]
                else:
                        matches = match
 
                for m in matches:
+                       include_filter = {}
+
                        if m:
                                if exclusive and not self.exclusive_filter:
                                        self.exclusive_filter = True
 
-                               include_filter['filter'] = m.lower()
+                               include_filter['type'] = self.FILTER_INCLUDE
+                               include_filter['filter'] = m
+                               include_filter['regex'] = re.compile(m)
                                self.filters.append(include_filter)
 
        def exclude(self, match):
@@ -90,23 +91,22 @@ class MagicFilter:
                Adds a new filter which explicitly excludes results that contain
                the specified matching text.
 
-               @match - Case insensitive text, or list of texts, to match.
+               @match - Regex, or list of regexs, to match.
                
                Returns None.
                '''
-               exclude_filter = {
-                               'type'          : self.FILTER_EXCLUDE,
-                               'filter'        : ''
-               }
-
-               if type(match) != type([]):
+               if not isinstance(match, type([])):
                        matches = [match]
                else:
                        matches = match
 
                for m in matches:
+                       exclude_filter = {}
+
                        if m:
-                               exclude_filter['filter'] = m.lower()
+                               exclude_filter['type'] = self.FILTER_EXCLUDE
+                               exclude_filter['filter'] = m
+                               exclude_filter['regex'] = re.compile(m)
                                self.filters.append(exclude_filter)
 
        def filter(self, data):
@@ -124,7 +124,7 @@ class MagicFilter:
                # Loop through the filters to see if any of them are a match. 
                # If so, return the registered type for the matching filter (FILTER_INCLUDE | FILTER_EXCLUDE). 
                for f in self.filters:
-                       if f['filter'] in data:
+                       if f['regex'].search(data):
                                return f['type']
 
                # If there was not explicit match and exclusive filtering is enabled, return FILTER_EXCLUDE.
@@ -166,7 +166,7 @@ class MagicFilter:
                Add or check case-insensitive grep filters against the supplied data string.
 
                @data    - Data string to check grep filters against. Not required if filters is specified.
-               @filters - Filter, or list of filters, to add to the grep filters list. Not required if data is specified.
+               @filters - Regex, or list of regexs, to add to the grep filters list. Not required if data is specified.
 
                Returns None if data is not specified.
                If data is specified, returns True if the data contains a grep filter, or if no grep filters exist.
@@ -174,14 +174,14 @@ class MagicFilter:
                '''
                # Add any specified filters to self.grep_filters
                if filters:
-                       if type(filters) != type([]):
+                       if not isinstance(filters, type([])):
                                gfilters = [filters]
                        else:
                                gfilters = filters
 
                        for gfilter in gfilters:
                                # Filters are compiled as regular expressions (no lower-casing is applied here)
-                               self.grep_filters.append(gfilter.lower())
+                               self.grep_filters.append(re.compile(gfilter))
 
                # Check the data against all grep filters until one is found
                if data is not None:
@@ -194,7 +194,7 @@ class MagicFilter:
 
                        # If a filter exists in data, return True
                        for gfilter in self.grep_filters:
-                               if gfilter in data:
+                               if gfilter.search(data):
                                        return True
 
                        # Else, return False
index 4a5b60a..717bfc1 100644 (file)
@@ -1,30 +1,36 @@
 # MIPS prologue
 # addiu $sp, -XX
 # 27 BD FF XX
-1      string  \377\275\47     MIPSEL function prologue
-0       string  \47\275\377    MIPS function prologue
+0      string  \377\275\47     MIPSEL instructions, function prologue{offset-adjust:-1}
+0       string  \47\275\377    MIPS instructions, function prologue
 
 # MIPS epilogue
 # jr $ra
-0      belong  0x03e00008      MIPS function epilogue
-0      lelong  0x03e00008      MIPSEL function epilogue
+0      belong  0x03e00008      MIPS instructions, function epilogue
+0      lelong  0x03e00008      MIPSEL instructions, function epilogue
+
+# PowerPC prologue
+# mflr r0
+0      belong 0x7C0802A6       PowerPC big endian instructions, function prologue
+0      lelong 0x7C0802A6       PowerPC little endian instructions, function prologue
 
 # PowerPC epilogue
 # blr
-0      belong 0x4E800020       PowerPC big endian function epilogue
-0      lelong 0x4E800020       PowerPC little endian function epilogue
+0      belong 0x4E800020       PowerPC big endian instructions, function epilogue
+0      lelong 0x4E800020       PowerPC little endian instructions, function epilogue
 
 # ARM prologue
 # STMFD SP!, {XX}
-0      beshort 0xE92D          ARMEB function prologue
-2      leshort 0xE92D          ARM function prologue
+0      beshort 0xE92D          ARMEB instructions, function prologue
+0      leshort 0xE92D          ARM instructions, function prologue{offset-adjust:-2}
 
 # ARM epilogue
 # LDMFD SP!, {XX}
-0      beshort 0xE8BD          ARMEB function epilogue
-2      leshort 0xE8BD          ARM function epilogue
+0      beshort 0xE8BD          ARMEB instructions, function epilogue
+0      leshort 0xE8BD          ARM instructions, function epilogue{offset-adjust:-2}
 
 # x86 epilogue
 # push ebp
 # move ebp, esp
-0      string  \x55\x89\xE5    Intel x86 function epilogue
+0      string  \x55\x89\xE5    Intel x86 instructions, function epilogue
+
index 3d3ec27..9b7811d 100644 (file)
@@ -1,5 +1,7 @@
 0      belong x        Hex:                 0x%.8X
 #0     string x        String:              %s
+0      lequad x        Little Endian Quad:  %lld
+0      bequad x        Big Endian Quad:     %lld
 0      lelong x        Little Endian Long:  %d
 0      belong x        Big Endian Long:     %d
 0      leshort x       Little Endian Short: %d
index 5f2e183..bf5ad01 100644 (file)
@@ -8,14 +8,77 @@
 0       string  \x1aJar\x1b JAR (ARJ Software, Inc.) archive data{offset-adjust:-14}
 0       string  JARCS JAR (ARJ Software, Inc.) archive data
 
+# ZIP compression (Greg Roelofs, c/o zip-bugs@wkuvx1.wku.edu)
+0       string          PK\003\004      Zip
+>6     leshort         &0x01           encrypted
+>0     byte            x               archive data,
+>4      byte            0x00            v0.0
+>4      byte            0x09            at least v0.9 to extract,
+>4      byte            0x0a            at least v1.0 to extract,
+>4      byte            0x0b            at least v1.1 to extract,
+>0x161  string          WINZIP          WinZIP self-extracting,
+>4      byte            0x14
+>>30    ubelong         !0x6d696d65     at least v2.0 to extract,
+>18    lelong          !0
+>>18   lelong          <0              invalid
+>>18   lelong          x               compressed size: %d,
+>>18   lelong          x               {jump-to-offset:%d}
+>22    lelong          !0
+>>22   lelong          <0              invalid
+>>22   lelong          x               uncompressed size: %d,{extract-delay:End of Zip archive}
+>30    string          x               {file-name:{raw-replace}}name: {raw-replace}
+>26    leshort         x               {raw-string-length:%d}
+>30    string          x               {raw-string:%s
+>61    string          x               \b%s
+>92    string          x               \b%s
+>123   string          x               \b%s
+>154   string          x               \b%s}
+
+# ZIP footer
+0      string          PK\x05\x06      End of Zip archive
+#>10   leshort         x               number of records: %d,
+#>12   leshort         x               size of central directory: %d
+#>20   leshort         x               {offset-adjust:22+%d}
+>20    leshort         >0
+>>20   leshort         x               \b, comment: {raw-replace}
+>>20   leshort         x               {raw-string-length:%d}
+>>22   string          x               {raw-string:%s}
 
 # ARJ archiver (jason@jarthur.Claremont.EDU)
-0       leshort         0xea60          ARJ archive data
->5      byte            x               \b, v%d,
+0       leshort         0xea60          ARJ archive data,
+>2     leshort         x               header size: %d,
+>5     byte            <1              invalid
+>5     byte            >16             invalid
+>5      byte            x               version %d,
+>6     byte            <1              invalid
+>6     byte            >16             invalid
+>6     byte            x               minimum version to extract: %d,
+>8     byte            <0              invalid flags,
 >8      byte            &0x04           multi-volume,
 >8      byte            &0x10           slash-switched,
 >8      byte            &0x20           backup,
->34     string          x               original name: "%s",
+>9     byte            <0              invalid compression method,
+>9     byte            >4              invalid compression method,
+>9     byte            0               compression method: stored,
+>9     byte            1               compression method: compressed most,
+>9     byte            2               compression method: compressed,
+>9     byte            3               compression method: compressed faster,
+>9     byte            4               compression method: compressed fastest,
+>10    byte            <0              invalid file type
+>10    byte            >4              invalid file type
+>10    byte            0               file type: binary,
+>10    byte            1               file type: 7-bit text,
+>10    byte            2               file type: comment header,
+>10    byte            3               file type: directory,
+>10    byte            4               file type: volume label,
+>34    byte            !0
+>>34   string          x               {file-name:%s}
+>>34    string          x               original name: "%s",
+>0xC   ledate          x               original file date: %s,
+>0x10  lelong          <0              invalid
+>0x10  lelong          x               compressed file size: %d,
+>0x14  lelong          <0              invalid
+>0x14  lelong          x               uncompressed file size: %d,
 >7      byte            0               os: MS-DOS 
 >7      byte            1               os: PRIMOS
 >7      byte            2               os: Unix
@@ -26,7 +89,8 @@
 >7      byte            7               os: Atari ST
 >7      byte            8               os: NeXT
 >7      byte            9               os: VAX/VMS
->3      byte            >0              %d]
+>7     byte            >9              invalid os
+>7     byte            <0              invalid os
 
 # RAR archiver (Greg Roelofs, newt@uchicago.edu)
 0      string          Rar!            RAR archive data
 
 0       string          070701          ASCII cpio archive (SVR4 with no CRC),
 >110   byte            0               invalid
+>110   byte            !0x2F
+>>110  string          !TRAILER!!!     invalid
 >110   string          x               file name: "%s"
 >54    string          x               {jump-to-offset:0x%.8s+112}
 
 0       string          070702          ASCII cpio archive (SVR4 with CRC)
 >110   byte            0               invalid
+>110   byte            !0x2F
+>>110  string          !TRAILER!!!     invalid
 >110   string          x               file name: "%s"
 >54    string          x               {jump-to-offset:0x%.8s+112}
 
 >8     beshort         4               MIPS
 >8     beshort         5               PowerPC
 >8     beshort         6               68000
->8     beshort         7       SGI
+>8     beshort         7               SGI
 >8     beshort         8               RS6000
 >8     beshort         9               IA64
 >8     beshort         10              Sparc64
 >8     beshort         12              ARM
 >10    string          x               "%s"
 
+# IBM AIX Backup File Format header and entry signatures
+0      lelong  0xea6b0009      BFF volume header,
+>4     leshort x               checksum: 0x%.4X,
+>6     leshort <0              invalid
+>6     leshort 0               invalid
+>6     leshort x               volume number: %d,
+>8     ledate  x               current date: %s,
+>12    ledate  x               starting date: %s,
+>20    string  x               disk name: "%s",
+>36    string  x               file system name: "%s",
+>52    string  x               user name: "%s"
+
+0      leshort 0xea6b          BFF volume entry,{offset-adjust:-2}
+>22    lelong  <0              invalid
+>22    lelong  0               directory,
+>22    lelong  >0
+>>22   lelong  x               file size: %d,
+>>54   lelong  <0              invalid
+>>54   lelong  0               invalid
+>>54   lelong  x               compressed size: %d,
+>58    lelong  !0              invalid
+>62    byte    0               invalid
+>62    byte    !0x2e
+>>62   byte    !0x2f           invalid
+>62    string  x               file name: "%s
+>92    string  x               \b%s"
+
+0      leshort 0xea6c          BFF volume entry, compressed,{offset-adjust:-2}
+>22    lelong  <0              invalid
+>22    lelong  0               directory,
+>22    lelong  >0
+>>22   lelong  x               file size: %d,
+>>54   lelong  <0              invalid
+>>54   lelong  0               invalid
+>>54   lelong  x               compressed size: %d,
+>58    lelong  !0              invalid
+>62    byte    0               invalid
+>62    byte    !0x2e
+>>62   byte    !0x2f           invalid
+>62    string  x               file name: "%s
+>92    string  x               \b%s"
+
+0      leshort 0xea6d          BFF volume entry, AIXv3,{offset-adjust:-2}
+>22    lelong  <0              invalid
+>22    lelong  0               directory,
+>22    lelong  >0
+>>22   lelong  x               file size: %d,
+>>54   lelong  <0              invalid
+>>54   lelong  0               invalid
+>>54   lelong  x               compressed size: %d,
+>58    lelong  !0              invalid
+>62    byte    0               invalid
+>62    byte    !0x2e
+>>62   byte    !0x2f           invalid
+>62    string  x               file name: "%s
+>92    string  x               \b%s"
+
 
 #---------------------------Bootloaders--------------------------------
 
 # CFE bootloader
-0      string  CFE1CFE1        CFE boot loader, little endian
-0      string  1EFC1EFC        CFE boot loader, big endian
+0      string  CFE1CFE1        CFE boot loader
+>40    string  CFE1CFE1        invalid
 
+# U-Boot boot loader
+0      string  U-Boot          U-Boot boot loader reference{one-of-many}
+0      string  U-BOOT          U-Boot boot loader reference{one-of-many}
+0      string  u-boot          U-Boot boot loader reference{one-of-many}
 
 #------------------Compression Formats-----------------------------
 
 # http://www.7-zip.org or DOC/7zFormat.txt 
 #
 0       string          7z\274\257\047\034      7-zip archive data,
+>6     byte            <0                      invalid
+>6     byte            0                       invalid
+>6     byte            >20                     invalid
 >6      byte            x                       version %d
 >7      byte            x                       \b.%d
 
 # standard unix compress
-0       beshort                0x1f9d          compress'd data
->2      byte&0x80       >0              block compressed
->2     byte&0x1f       !16             invalid
->2      byte&0x1f       x               %d bits
+#0       beshort               0x1f9d          compress'd data
+#>2      byte&0x80       >0              block compressed
+#>2    byte&0x1f       !16             invalid
+#>2      byte&0x1f       x               %d bits
 
 # http://tukaani.org/xz/xz-file-format.txt
 0      string          \xFD\x37\x7a\x58\x5a\x00        xz compressed data
 #       * Original filename is only at offset 10 if "extra field" absent
 #       * Produce shorter output - notably, only report compression methods
 #         other than 8 ("deflate", the only method defined in RFC 1952).
-0       string          \037\213\x08    gzip compressed data
+#0       string          \037\213\x08    gzip compressed data
+0      string          \x1f\x8b\x08    gzip compressed data
 >3      byte            &0x01           \b, ASCII
 >3      byte            &0x02           \b, has CRC
 >3      byte            &0x04           \b, extra field
 >9      byte            =0x0B           \b, from NTFS filesystem (NT)
 >9      byte            =0x0C           \b, from QDOS
 >9      byte            =0x0D           \b, from Acorn RISCOS
->9     byte            >0x0D           \b, invalid source
->9     byte            <0              \b, invalid source
+#>9    byte            =0xFF           \b, from ZyNOS
+#>9    byte            >0x0D           \b, invalid
+#>>9   byte            x               source: 0x%.2X
+#>9    byte            <0              \b, invalid
+#>>9   byte            x               source: 0x%.2X
 >3      byte            &0x20           \b, encrypted (invalid)
 # Dates before 1992 are invalid, unless of course you're DD-WRT in which
 # case you don't know how to set a date in your gzip files. Brilliant.
 >8      byte            4               \b, max speed
 
 # Zlib signatures
-0      beshort         0x789C          zlib compressed data
-0      beshort         0x78DA          zlib compressed data
-0      beshort         0x7801          zlib compressed data
+# Useless until they can be further improved.
+#0     beshort         0x789C          zlib compressed data
+#0     beshort         0x78DA          zlib compressed data
+#0     beshort         0x7801          zlib compressed data
 
 # Supplementary magic data for the file(1) command to support
 # rzip(1).  The format is described in magic(5).
 >5      byte            x               \b.%d
 >6      belong          x               (%d bytes)
 
-# ZIP compression (Greg Roelofs, c/o zip-bugs@wkuvx1.wku.edu)
-0       string          PK\003\004      Zip archive data,
->4      byte            0x00            v0.0
->4      byte            0x09            at least v0.9 to extract,
->4      byte            0x0a            at least v1.0 to extract,
->4      byte            0x0b            at least v1.1 to extract,
->0x161  string          WINZIP          WinZIP self-extracting,
->4      byte            0x14
->>30    ubelong         !0x6d696d65     at least v2.0 to extract,
->18    lelong          !0
->>18   lelong          <0              invalid
->>18   lelong          x               compressed size: %d,
->22    lelong          !0
->>22   lelong          <0              invalid
->>22   lelong          x               uncompressed size: %d,{extract-delay:end of zip archive}
->30    string          x               {file-name:{raw-replace}}name: {raw-replace}
->26    leshort         x               {raw-string-length:%d}
->30    string          x               {raw-string:%s
->61    string          x               \b%s
->92    string          x               \b%s
->123   string          x               \b%s
->154   string          x               \b%s}
-
-# ZIP footer
-0      string          PK\x05\x06      End of Zip archive
->20    leshort         x               {offset-adjust:22+%d}
->20    leshort         >0
->>20   leshort         x               \b, comment: {raw-replace}
->>20   leshort         x               {raw-string-length:%d}
->>22   string          x               {raw-string:%s}
-
 # New LZMA format signature
 0      string          \xFFLZMA\x00    LZMA compressed data (new),
 >6     byte&0x10       0               single-block stream
 # Type: OpenSSL certificates/key files
 # From: Nicolas Collignon <tsointsoin@gmail.com>
 
-0       string  -----BEGIN\x20CERTIFICATE-----    PEM certificate
+0       string  -----BEGIN\x20CERTIFICATE-----      PEM certificate
 0       string  -----BEGIN\x20CERTIFICATE\x20REQ    PEM certificate request
 0       string  -----BEGIN\x20RSA\x20PRIVATE        PEM RSA private key
 0       string  -----BEGIN\x20DSA\x20PRIVATE        PEM DSA private key
 # The format is very similar to pgp
 0      string          \001gpg                 GPG key trust database
 >4     byte            x                       version %d
-0       beshort         0x9901                  GPG key public ring
+
+# Not a very useful signature
+#0       beshort         0x9901                  GPG key public ring
+
 # This magic is not particularly good, as the keyrings don't have true
 # magic. Nevertheless, it covers many keyrings.
 
 #0       beshort         0x9501                  PGP key security ring
 #0       beshort         0x9500                  PGP key security ring
 #0     beshort         0xa600                  PGP encrypted data
-0       string          -----BEGIN\040PGP       PGP armored data
+0       string          -----BEGIN\040PGP       PGP armored data,
 >15     string          PUBLIC\040KEY\040BLOCK- public key block
 >15     string          MESSAGE-                message
 >15     string          SIGNED\040MESSAGE-      signed message
 #
 # many of the compressed formats were extracted from IDARC 1.23 source code
 #
-0       string  MZ     Microsoft
->0x18  leshort <0x40 MS-DOS executable
->0 string MZ\0\0\0\0\0\0\0\0\0\0PE\0\0 \b, PE for MS Windows
+
+# Not a very useful signature...
+#0       string  MZ    Microsoft
+#>0x18  leshort <0x40 MS-DOS executable
+
+0 string MZ\0\0\0\0\0\0\0\0\0\0PE\0\0  Microsoft PE for MS Windows
 >>&18   leshort&0x2000  >0      (DLL)
 >>&88   leshort         0       (unknown subsystem)
 >>&88   leshort         1       (native)
 >>36    belong&0x8      0x8             gzdata
 
 
-#-----------------------------------------------------------------
-# MIPS COFF file formats
-#
-0       beshort 0x0160          MIPSEB ECOFF executable
->20     beshort 0407            (impure)
->20     beshort 0410            (swapped)
->20     beshort 0413            (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->22     byte    x               - version %ld
->23     byte    x               \b.%ld
-#
-0       beshort 0x0162          MIPSEL-BE ECOFF executable
->20     beshort 0407            (impure)
->20     beshort 0410            (swapped)
->20     beshort 0413            (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %d
->22     byte    x               \b.%ld
-#
-0       beshort 0x6001          MIPSEB-LE ECOFF executable
->20     beshort 03401           (impure)
->20     beshort 04001           (swapped)
->20     beshort 05401           (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %d
->22     byte    x               \b.%ld
-#
-0       beshort 0x6201          MIPSEL ECOFF executable
->20     beshort 03401           (impure)
->20     beshort 04001           (swapped)
->20     beshort 05401           (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %ld
->22     byte    x               \b.%ld
-# MIPS 2 additions
-#
-0       beshort 0x0163          MIPSEB MIPS-II ECOFF executable
->20     beshort 0407            (impure)
->20     beshort 0410            (swapped)
->20     beshort 0413            (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->22     byte    x               - version %ld
->23     byte    x               \b.%ld
-#
-0       beshort 0x0166          MIPSEL-BE MIPS-II ECOFF executable
->20     beshort 0407            (impure)
->20     beshort 0410            (swapped)
->20     beshort 0413            (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->22     byte    x               - version %ld
->23     byte    x               \b.%ld
-#
-0       beshort 0x6301          MIPSEB-LE MIPS-II ECOFF executable
->20     beshort 03401           (impure)
->20     beshort 04001           (swapped)
->20     beshort 05401           (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %ld
->22     byte    x               \b.%ld
-#
-0       beshort 0x6601          MIPSEL MIPS-II ECOFF executable
->20     beshort 03401           (impure)
->20     beshort 04001           (swapped)
->20     beshort 05401           (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %ld
->22     byte    x               \b.%ld
-# MIPS 3 additions
-#
-0       beshort 0x0140          MIPSEB MIPS-III ECOFF executable
->20     beshort 0407            (impure)
->20     beshort 0410            (swapped)
->20     beshort 0413            (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->22     byte    x               - version %ld
->23     byte    x               \b.%ld
-#
-0       beshort 0x0142          MIPSEL-BE MIPS-III ECOFF executable
->20     beshort 0407            (impure)
->20     beshort 0410            (swapped)
->20     beshort 0413            (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->22     byte    x               - version %ld
->23     byte    x               \b.%ld
-#
-0       beshort 0x4001          MIPSEB-LE MIPS-III ECOFF executable
->20     beshort 03401           (impure)
->20     beshort 04001           (swapped)
->20     beshort 05401           (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %ld
->22     byte    x               \b.%ld
-#
-0       beshort 0x4201          MIPSEL MIPS-III ECOFF executable
->20     beshort 03401           (impure)
->20     beshort 04001           (swapped)
->20     beshort 05401           (paged)
->8      belong  >0              not stripped
->8      belong  0               stripped
->23     byte    x               - version %ld
->22     byte    x               \b.%ld
-#
-0       beshort 0x180           MIPSEB Ucode
-0       beshort 0x182           MIPSEL-BE Ucode
-
-
 # Windows CE package files
 0       string          MSCE\0\0\0\0    Microsoft WinCE installer
 >20     lelong          0               \b, architecture-independent
 #
 # 68K
 #
-0       beshort         0x0208          mc68k COFF
->18     beshort         ^00000020       object
->18     beshort         &00000020       executable
->12     belong          >0              not stripped
->168    string          .lowmem         Apple toolbox
->20     beshort         0407            (impure)
->20     beshort         0410            (pure)
->20     beshort         0413            (demand paged)
->20     beshort         0421            (standalone)
-0       beshort         0x0209          mc68k executable (shared)
->12     belong          >0              not stripped
-0       beshort         0x020A          mc68k executable (shared demand paged)
->12     belong          >0              not stripped
-
+# These signatures are useless without further sanity checking. Disable them until 
+# that can be implemented.
+#0       beshort         0x0208          mc68k COFF
+#>18     beshort         ^00000020       object
+#>18     beshort         &00000020       executable
+#>12     belong          >0              not stripped
+#>168    string          .lowmem         Apple toolbox
+#>20     beshort         0407            (impure)
+#>20     beshort         0410            (pure)
+#>20     beshort         0413            (demand paged)
+#>20     beshort         0421            (standalone)
+#0       beshort         0x0209          mc68k executable (shared)
+#>12     belong          >0              not stripped
+#0       beshort         0x020A          mc68k executable (shared demand paged)
+#>12     belong          >0              not stripped
 
-#
-# Motorola/UniSoft 68K Binary Compatibility Standard (BCS)
-#
-0       beshort         0x022A            68K BCS executable
-#
-# 88K
-#
-# Motorola/88Open BCS
-#
-0       beshort         0x022B            88K BCS executable
 
 #------------------------------------------------------------------------------
 # Sony Playstation executables (Adam Sjoegren <asjo@diku.dk>) :
 # cisco:  file(1) magic for cisco Systems routers
 #
 # Most cisco file-formats are covered by the generic elf code
-#
-# Microcode files are non-ELF, 0x8501 conflicts with NetBSD/alpha.
-0      beshort                 0x8501      cisco IOS
->0      belong&0xffffff00       0x85011400  microcode
->0      belong&0xffffff00       0x8501cb00  experimental microcode
->7      string                 >\0         for "%s"
+0      string                  \x85\x01\x14    Cisco IOS microcode
+>7      string                 >\0             
+>>7    string                  x               for "%s"
+0      string                  \x85\x01\xcb    Cisco IOS experimental microcode
+>7      string                 >\0             
+>>7    string                  x               for "%s"
 
 # EST flat binary format (which isn't, but anyway)
 # From: Mark Brown <broonie@sirena.org.uk>
 #--------------------File Systems---------------------
 
 # Minix filesystems - Juan Cespedes <cespedes@debian.org>
-0x410   leshort         0x137f          Minix filesystem
->0x402  beshort         !0              \b, %d zones
->0x1e   string          minix           \b, bootable
-0x410   leshort         0x138f          Minix filesystem, 30 char names
-0x410   leshort         0x2468          Minix filesystem, version 2
-0x410   leshort         0x2478          Minix filesystem, version 2, 30 char names
-0x410  leshort         0x4d5a          Minix filesystem, version 3
-0x410  leshort         0x4d6a          Minix filesystem, version 3, 30 char names
-
-0x410   beshort         0x137f          Minix filesystem (big endian)
->0x402  beshort         !0              \b, %d zones
->0x1e   string          minix           \b, bootable
-0x410   beshort         0x138f          Minix filesystem (big endian), 30 char names
-0x410   beshort         0x2468          Minix filesystem (big endian), version 2
-0x410   beshort         0x2478          Minix filesystem (big endian), version 2, 30 char names
-0x410  beshort         0x4d5a          Minix filesystem (big endian), version 3
-0x410  beshort         0x4d6a          Minix filesystem (big endian), version 3, 30 char names
+# These signatures are useless until they can be improved.
+#0x410   leshort         0x137f          Minix filesystem
+#>0x402  beshort         !0              \b, %d zones
+#>0x1e   string          minix           \b, bootable
+#0x410   leshort         0x138f          Minix filesystem, 30 char names
+#0x410   leshort         0x2468          Minix filesystem, version 2
+#0x410   leshort         0x2478          Minix filesystem, version 2, 30 char names
+#0x410 leshort         0x4d5a          Minix filesystem, version 3
+#0x410 leshort         0x4d6a          Minix filesystem, version 3, 30 char names
+
+#0x410   beshort         0x137f          Minix filesystem (big endian)
+#>0x402  beshort         !0              \b, %d zones
+#>0x1e   string          minix           \b, bootable
+#0x410   beshort         0x138f          Minix filesystem (big endian), 30 char names
+#0x410   beshort         0x2468          Minix filesystem (big endian), version 2
+#0x410   beshort         0x2478          Minix filesystem (big endian), version 2, 30 char names
+#0x410 beshort         0x4d5a          Minix filesystem (big endian), version 3
+#0x410 beshort         0x4d6a          Minix filesystem (big endian), version 3, 30 char names
 
 # YAFFS
 0      string  \x03\x00\x00\x00\x01\x00\x00\x00\xFF\xFF        YAFFS filesystem
 # files in between the JFFS2 file systems. This is an unlikely scenario however, and
 # the below signatures are much improved in terms of readability and accuracy in the
 # vast majority of real world scenarios.
-0              leshort 0x1985  JFFS2 filesystem, little endian{filter-include}
+0              leshort 0x1985  JFFS2 filesystem, little endian
 >2             leshort !0xE001
 >>2            leshort !0xE002
 >>>2           leshort !0x2003
 >>>>>>(4.l+1)   leshort !0xFFFF
 >>>>>>>(4.l+2)  leshort !0xFFFF
 >>>>>>>>(4.l+3) leshort !0xFFFF \b, invalid
+>4             lelong  0       invalid
+>4             lelong  <0      invalid
 >4             lelong  x       {one-of-many}{jump-to-offset:%d}
 
-0              beshort 0x1985  JFFS2 filesystem, big endian{filter-include}
+0              beshort 0x1985  JFFS2 filesystem, big endian
 >2             beshort !0xE001
 >>2            beshort !0xE002
 >>>2           beshort !0x2003
 >>>>>>(4.L+1)  beshort !0xFFFF
 >>>>>>>(4.L+2) beshort !0xFFFF
 >>>>>>>>(4.L+3)        beshort !0xFFFF \b, invalid
+>4             belong  0       invalid
+>4             belong  <0      invalid
 >4             belong  x       {one-of-many}{jump-to-offset:%d}
 
 
 # ext4 filesystem - Eric Sandeen <sandeen@sandeen.net>
 # volume label and UUID Russell Coker
 # http://etbe.coker.com.au/2008/07/08/label-vs-uuid-vs-device/
-0   leshort         0xEF53             Linux EXT filesystem,{filter-include}{offset-adjust:-0x438}
+0   leshort         0xEF53             Linux EXT filesystem,{offset-adjust:-0x438}
 >2     leshort         >4              invalid state
 >2     leshort         3               invalid state
 >2     leshort         <0              invalid state
 # Ubicom firmware image
 0      belong  0xFA320080              Ubicom firmware header,
 >12    belong  x                       checksum: 0x%X,
+>24    belong  <0                      invalid
 >24    belong  x                       image size: %d
 
 # The ROME bootloader is used by several RealTek-based products.
 
 # PackImg tag, sometimes used as a delimiter between the kernel and rootfs in firmware images.
 0      string          --PaCkImGs--    PackImg section delimiter tag,
->16    lelong          x               little endian size: %d bytes;
->16    belong          x               big endian size: %d bytes
+# If the size in both big and little endian is greater than 512MB, consider this a false positive
+>16    lelong          >0x20000000
+>>16   belong          >0x20000000     invalid
+>16    lelong          <0
+>>16   belong          <0              invalid
+>16    lelong          >0
+>>16   lelong          x               little endian size: %d bytes;
+>16    belong          >0              
+>>16   belong          x               big endian size: %d bytes
 
 
 #------------------------------------------------------------------------------
 # Broadcom header format
 #
 0       string          BCRM            Broadcom header,
+>4     lelong          <0              invalid
 >4      lelong          x               number of sections: %d,
 >>8     lelong          18              first section type: flash
 >>8     lelong          19              first section type: disk
 
 #
 # Motorola S-Records, from Gerd Truschinski <gt@freebsd.first.gmd.de>
-0   string      S0          Motorola S-Record; binary data in text format
+# Useless until further improvements can be made to the signature.
+#0   string      S0          Motorola S-Record; binary data in text format
 
 # --------------------------------
 # Microsoft Xbox data file formats
 >0x8C          belong          x                               bootloader offset: %d,
 >0x90          belong          x                               bootloader length: %d
 
+# Header format from: http://skaya.enix.org/wiki/FirmwareFormat
+0      string          \x36\x00\x00\x00                Broadcom 96345 firmware header, header size: 256,
+>4     string          !Broadcom
+>>4    string          !\x20\x20\x20\x20               invalid
+>41    beshort         !0x2020
+>>41   beshort         !0x0000
+>>>41  string          x                               firmware version: "%.4s",
+>45    beshort         !0x0202
+>>45   beshort         !0x0000
+>>>45  string          x                               board id: "%s",
+>236   belong          x                               ~CRC32 header checksum: 0x%X,
+>216   belong          x                               ~CRC32 data checksum: 0x%X
+
+# Xerox MFP DLM signatures
+0      string          %%XRXbegin                      Xerox DLM firmware start of header
+0      string          %%OID_ATT_DLM_NAME              Xerox DLM firmware name:
+>19    string          x                               "%s"
+0      string          %%OID_ATT_DLM_VERSION           Xerox DLM firmware version:
+>22    string          x                               "%s"
+0      string          %%XRXend                        Xerox DLM firmware end of header
+
+# Generic copyright signature
+0      string          Copyright                       Copyright string:
+>9     byte            0                               invalid
+>9     string          x                               "%s
+>40    string          x                               \b%s"
+
+# Sercomm firmware header
+0      string          sErCoMm                         Sercomm firmware signature,
+>7     leshort         x                               version control: %d,
+>9     leshort         x                               download control: %d,
+>11    string          x                               hardware ID: "%s",
+>44    leshort         x                               hardware version: 0x%X,
+>58    leshort         x                               firmware version: 0x%X,
+>60    leshort         x                               starting code segment: 0x%X,
+>62    leshort         x                               code size: 0x%X
 # Tag Image File Format, from Daniel Quinlan (quinlan@yggdrasil.com)
 # The second word of TIFF files is the TIFF version number, 42, which has
 # never changed.  The TIFF specification recommends testing for it.
 
 # GIF
 0       string          GIF8            GIF image data
->4      string          7a              \b, version 8"%s",
->4      string          9a              \b, version 8"%s",
+>4      string          7a              \b, version "8%s",
+>4      string          9a              \b, version "8%s",
 >6      leshort         >0              %hd x
 >8      leshort         >0              %hd
 #>10    byte            &0x80           color mapped,
 #
 # both of which turn into "JPEG image data" here.
 #
-0       beshort         0xffd8          JPEG image data
->6      string          JFIF            \b, JFIF standard
+0       belong         0xffd8ffe0      JPEG image data, JFIF standard 
+>6      string         !JFIF           invalid
 # The following added by Erik Rossen <rossen@freesurf.ch> 1999-09-06
 # in a vain attempt to add image size reporting for JFIF.  Note that these
 # tests are not fool-proof since some perfectly valid JPEGs are currently
 # impossible to specify in magic(4) format.
 # First, a little JFIF version info:
->>11    byte            x               \b %d.
->>12    byte            x               \b%02d
+>11    byte            x               \b %d.
+>12    byte            x               \b%02d
 # Next, the resolution or aspect ratio of the image:
 #>>13   byte            0               \b, aspect ratio
 #>>13   byte            1               \b, resolution (DPI)
 #>>13   byte            2               \b, resolution (DPCM)
 #>>4    beshort         x               \b, segment length %d
 # Next, show thumbnail info, if it exists:
->>18    byte            !0              \b, thumbnail %dx
->>>19   byte            x               \b%d
+>18    byte            !0              \b, thumbnail %dx
+>>19   byte            x               \b%d
 
+
+0      belong          0xffd8ffe1      JPEG image data, EXIF standard
 # EXIF moved down here to avoid reporting a bogus version number,
 # and EXIF version number printing added.
 #   - Patrik Rådman <patrik+file-magic@iki.fi>
->6      string          Exif            \b, EXIF standard
+>6      string          !Exif            invalid
 # Look for EXIF IFD offset in IFD 0, and then look for EXIF version tag in EXIF IFD.
 # All possible combinations of entries have to be enumerated, since no looping
 # is possible. And both endians are possible...
 # The combinations included below are from real-world JPEGs.
 # Little-endian
->>12    string          II
+>12    string          II
 # IFD 0 Entry #5:
->>>70   leshort         0x8769
+>>70   leshort         0x8769
 # EXIF IFD Entry #1:
->>>>(78.l+14)   leshort 0x9000
->>>>>(78.l+23)  byte    x               %c
->>>>>(78.l+24)  byte    x               \b.%c
->>>>>(78.l+25)  byte    !0x30           \b%c
+>>>(78.l+14)   leshort 0x9000
+>>>>(78.l+23)  byte    x               %c
+>>>>(78.l+24)  byte    x               \b.%c
+>>>>(78.l+25)  byte    !0x30           \b%c
 # IFD 0 Entry #9:
->>>118  leshort         0x8769
+>>118  leshort         0x8769
 # EXIF IFD Entry #3:
->>>>(126.l+38)  leshort 0x9000
->>>>>(126.l+47) byte    x               %c
->>>>>(126.l+48) byte    x               \b.%c
->>>>>(126.l+49) byte    !0x30           \b%c
+>>>(126.l+38)  leshort 0x9000
+>>>>(126.l+47) byte    x               %c
+>>>>(126.l+48) byte    x               \b.%c
+>>>>(126.l+49) byte    !0x30           \b%c
 # IFD 0 Entry #10
->>>130  leshort         0x8769
+>>130  leshort         0x8769
 # EXIF IFD Entry #3:
->>>>(138.l+38)  leshort 0x9000
->>>>>(138.l+47) byte    x               %c
->>>>>(138.l+48) byte    x               \b.%c
->>>>>(138.l+49) byte    !0x30           \b%c
+>>>(138.l+38)  leshort 0x9000
+>>>>(138.l+47) byte    x               %c
+>>>>(138.l+48) byte    x               \b.%c
+>>>>(138.l+49) byte    !0x30           \b%c
 # EXIF IFD Entry #4:
->>>>(138.l+50)  leshort 0x9000
->>>>>(138.l+59) byte    x               %c
->>>>>(138.l+60) byte    x               \b.%c
->>>>>(138.l+61) byte    !0x30           \b%c
+>>>(138.l+50)  leshort 0x9000
+>>>>(138.l+59) byte    x               %c
+>>>>(138.l+60) byte    x               \b.%c
+>>>>(138.l+61) byte    !0x30           \b%c
 # EXIF IFD Entry #5:
->>>>(138.l+62)  leshort 0x9000
->>>>>(138.l+71) byte    x               %c
->>>>>(138.l+72) byte    x               \b.%c
->>>>>(138.l+73) byte    !0x30           \b%c
+>>>(138.l+62)  leshort 0x9000
+>>>>(138.l+71) byte    x               %c
+>>>>(138.l+72) byte    x               \b.%c
+>>>>(138.l+73) byte    !0x30           \b%c
 # IFD 0 Entry #11
->>>142  leshort         0x8769
+>>142  leshort         0x8769
 # EXIF IFD Entry #3:
->>>>(150.l+38)  leshort 0x9000
->>>>>(150.l+47) byte    x               %c
->>>>>(150.l+48) byte    x               \b.%c
->>>>>(150.l+49) byte    !0x30           \b%c
+>>>(150.l+38)  leshort 0x9000
+>>>>(150.l+47) byte    x               %c
+>>>>(150.l+48) byte    x               \b.%c
+>>>>(150.l+49) byte    !0x30           \b%c
 # EXIF IFD Entry #4:
->>>>(150.l+50)  leshort 0x9000
->>>>>(150.l+59) byte    x               %c
->>>>>(150.l+60) byte    x               \b.%c
->>>>>(150.l+61) byte    !0x30           \b%c
+>>>(150.l+50)  leshort 0x9000
+>>>>(150.l+59) byte    x               %c
+>>>>(150.l+60) byte    x               \b.%c
+>>>>(150.l+61) byte    !0x30           \b%c
 # EXIF IFD Entry #5:
->>>>(150.l+62)  leshort 0x9000
->>>>>(150.l+71) byte    x               %c
->>>>>(150.l+72) byte    x               \b.%c
->>>>>(150.l+73) byte    !0x30           \b%c
+>>>(150.l+62)  leshort 0x9000
+>>>>(150.l+71) byte    x               %c
+>>>>(150.l+72) byte    x               \b.%c
+>>>>(150.l+73) byte    !0x30           \b%c
 # Big-endian
->>12    string          MM
+>12    string          MM
 # IFD 0 Entry #9:
->>>118  beshort         0x8769
+>>118  beshort         0x8769
 # EXIF IFD Entry #1:
->>>>(126.L+14)  beshort 0x9000
->>>>>(126.L+23) byte    x               %c
->>>>>(126.L+24) byte    x               \b.%c
->>>>>(126.L+25) byte    !0x30           \b%c
+>>>(126.L+14)  beshort 0x9000
+>>>>(126.L+23) byte    x               %c
+>>>>(126.L+24) byte    x               \b.%c
+>>>>(126.L+25) byte    !0x30           \b%c
 # EXIF IFD Entry #3:
->>>>(126.L+38)  beshort 0x9000
->>>>>(126.L+47) byte    x               %c
->>>>>(126.L+48) byte    x               \b.%c
->>>>>(126.L+49) byte    !0x30           \b%c
+>>>(126.L+38)  beshort 0x9000
+>>>>(126.L+47) byte    x               %c
+>>>>(126.L+48) byte    x               \b.%c
+>>>>(126.L+49) byte    !0x30           \b%c
 # IFD 0 Entry #10
->>>130  beshort         0x8769
+>>130  beshort         0x8769
 # EXIF IFD Entry #3:
->>>>(138.L+38)  beshort 0x9000
->>>>>(138.L+47) byte    x               %c
->>>>>(138.L+48) byte    x               \b.%c
->>>>>(138.L+49) byte    !0x30           \b%c
+>>>(138.L+38)  beshort 0x9000
+>>>>(138.L+47) byte    x               %c
+>>>>(138.L+48) byte    x               \b.%c
+>>>>(138.L+49) byte    !0x30           \b%c
 # EXIF IFD Entry #5:
->>>>(138.L+62)  beshort 0x9000
->>>>>(138.L+71) byte    x               %c
->>>>>(138.L+72) byte    x               \b.%c
->>>>>(138.L+73) byte    !0x30           \b%c
+>>>(138.L+62)  beshort 0x9000
+>>>>(138.L+71) byte    x               %c
+>>>>(138.L+72) byte    x               \b.%c
+>>>>(138.L+73) byte    !0x30           \b%c
 # IFD 0 Entry #11
->>>142  beshort         0x8769
+>>142  beshort         0x8769
 # EXIF IFD Entry #4:
->>>>(150.L+50)  beshort 0x9000
->>>>>(150.L+59) byte    x               %c
->>>>>(150.L+60) byte    x               \b.%c
->>>>>(150.L+61) byte    !0x30           \b%c
+>>>(150.L+50)  beshort 0x9000
+>>>>(150.L+59) byte    x               %c
+>>>>(150.L+60) byte    x               \b.%c
+>>>>(150.L+61) byte    !0x30           \b%c
 # Here things get sticky.  We can do ONE MORE marker segment with
 # indirect addressing, and that's all.  It would be great if we could
 # do pointer arithmetic like in an assembler language.  Christos?
 #>14    beshort         x               \b, %d x
 #>16    beshort         x               \b %d
 
+0      string  M88888888888888888888888888     Binwalk logo, ASCII art (Toph){offset-adjust:-50}
+>27    string  !8888888888\n                   invalid
+
 
 #-------------------------Kernels-------------------------------------
 
 >14    byte            0                       invalid
 >14    byte            !0
 >>14   string          x                       "%s
->>45   string          x                       \b%s
->>76   string          x                       \b%s
->>107  string          x                       \b%s"
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x5D
-# ------------------------------------------------------------------
-0              string  \x5D\x00\x00    LZMA compressed data, properties: 0x5D,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x01
-# ------------------------------------------------------------------
-0              string  \x01\x00\x00    LZMA compressed data, properties: 0x01,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x02
-# ------------------------------------------------------------------
-0              string  \x02\x00\x00    LZMA compressed data, properties: 0x02,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
+>>45   string          x                       \b%s"
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x03
+# Signature for LZMA compressed data with valid properties byte 0x40
 # ------------------------------------------------------------------
-0              string  \x03\x00\x00    LZMA compressed data, properties: 0x03,
+0              string  \x40\x00\x00    LZMA compressed data, properties: 0x40,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x04
+# Signature for LZMA compressed data with valid properties byte 0x41
 # ------------------------------------------------------------------
-0              string  \x04\x00\x00    LZMA compressed data, properties: 0x04,
+0              string  \x41\x00\x00    LZMA compressed data, properties: 0x41,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x09
+# Signature for LZMA compressed data with valid properties byte 0x48
 # ------------------------------------------------------------------
-0              string  \x09\x00\x00    LZMA compressed data, properties: 0x09,
+0              string  \x48\x00\x00    LZMA compressed data, properties: 0x48,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x0A
+# Signature for LZMA compressed data with valid properties byte 0x49
 # ------------------------------------------------------------------
-0              string  \x0A\x00\x00    LZMA compressed data, properties: 0x0A,
+0              string  \x49\x00\x00    LZMA compressed data, properties: 0x49,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x0B
+# Signature for LZMA compressed data with valid properties byte 0x51
 # ------------------------------------------------------------------
-0              string  \x0B\x00\x00    LZMA compressed data, properties: 0x0B,
+0              string  \x51\x00\x00    LZMA compressed data, properties: 0x51,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x0C
+# Signature for LZMA compressed data with valid properties byte 0x5A
 # ------------------------------------------------------------------
-0              string  \x0C\x00\x00    LZMA compressed data, properties: 0x0C,
+0              string  \x5A\x00\x00    LZMA compressed data, properties: 0x5A,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x12
+# Signature for LZMA compressed data with valid properties byte 0x5B
 # ------------------------------------------------------------------
-0              string  \x12\x00\x00    LZMA compressed data, properties: 0x12,
+0              string  \x5B\x00\x00    LZMA compressed data, properties: 0x5B,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x13
+# Signature for LZMA compressed data with valid properties byte 0x5C
 # ------------------------------------------------------------------
-0              string  \x13\x00\x00    LZMA compressed data, properties: 0x13,
+0              string  \x5C\x00\x00    LZMA compressed data, properties: 0x5C,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and no more than 1GB (a value of -1 IS valid).
+# Sizes outside this range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These checks are not 100% reliable. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 
 
 # ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x14
+# Signature for LZMA compressed data with valid properties byte 0x5D
 # ------------------------------------------------------------------
-0              string  \x14\x00\x00    LZMA compressed data, properties: 0x14,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x1B
-# ------------------------------------------------------------------
-0              string  \x1B\x00\x00    LZMA compressed data, properties: 0x1B,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x1C
-# ------------------------------------------------------------------
-0              string  \x1C\x00\x00    LZMA compressed data, properties: 0x1C,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x24
-# ------------------------------------------------------------------
-0              string  \x24\x00\x00    LZMA compressed data, properties: 0x24,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x2D
-# ------------------------------------------------------------------
-0              string  \x2D\x00\x00    LZMA compressed data, properties: 0x2D,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x2E
-# ------------------------------------------------------------------
-0              string  \x2E\x00\x00    LZMA compressed data, properties: 0x2E,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x2F
-# ------------------------------------------------------------------
-0              string  \x2F\x00\x00    LZMA compressed data, properties: 0x2F,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x30
-# ------------------------------------------------------------------
-0              string  \x30\x00\x00    LZMA compressed data, properties: 0x30,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x31
-# ------------------------------------------------------------------
-0              string  \x31\x00\x00    LZMA compressed data, properties: 0x31,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x36
-# ------------------------------------------------------------------
-0              string  \x36\x00\x00    LZMA compressed data, properties: 0x36,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x37
-# ------------------------------------------------------------------
-0              string  \x37\x00\x00    LZMA compressed data, properties: 0x37,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x38
-# ------------------------------------------------------------------
-0              string  \x38\x00\x00    LZMA compressed data, properties: 0x38,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x39
-# ------------------------------------------------------------------
-0              string  \x39\x00\x00    LZMA compressed data, properties: 0x39,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x3F
-# ------------------------------------------------------------------
-0              string  \x3F\x00\x00    LZMA compressed data, properties: 0x3F,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x40
-# ------------------------------------------------------------------
-0              string  \x40\x00\x00    LZMA compressed data, properties: 0x40,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x41
-# ------------------------------------------------------------------
-0              string  \x41\x00\x00    LZMA compressed data, properties: 0x41,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x48
-# ------------------------------------------------------------------
-0              string  \x48\x00\x00    LZMA compressed data, properties: 0x48,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid uncompressed size is at least 1 byte and at most 1GB. A larger size could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These checks are not 100% accurate. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates many of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x49
-# ------------------------------------------------------------------
-0              string  \x49\x00\x00    LZMA compressed data, properties: 0x49,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x51
-# ------------------------------------------------------------------
-0              string  \x51\x00\x00    LZMA compressed data, properties: 0x51,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x5A
-# ------------------------------------------------------------------
-0              string  \x5A\x00\x00    LZMA compressed data, properties: 0x5A,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x5B
-# ------------------------------------------------------------------
-0              string  \x5B\x00\x00    LZMA compressed data, properties: 0x5B,
-
-# These are all the valid dictionary sizes supported by LZMA utils.
->1             lelong  !65536  
->>1            lelong  !131072 
->>>1           lelong  !262144 
->>>>1          lelong  !524288 
->>>>>1         lelong  !1048576        
->>>>>>1                lelong  !2097152        
->>>>>>>1       lelong  !4194304        
->>>>>>>>1      lelong  !8388608        
->>>>>>>>>1     lelong  !16777216       
->>>>>>>>>>1    lelong  !33554432       invalid
->1             lelong  x               dictionary size: %d bytes,
-
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
-
-# These are not 100%. The uncompressed size could be exactly the same as the dicionary size, but it is unlikely.
-# Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
-# marking matches with the same uncompressed and dictionary sizes as invalid eliminates much of these false positives.
->1             lelong  65536
->>5            lequad  65536           invalid
->1             lelong  131072
->>5            lequad  131072          invalid
->1             lelong  262144
->>5            lequad  262144          invalid
->1             lelong  524288
->>5            lequad  524288          invalid
->1             lelong  1048576
->>5            lequad  1048576         invalid
->1             lelong  2097152
->>5            lequad  2097152         invalid
->1             lelong  4194304
->>5            lequad  4194304         invalid
->1             lelong  8388608
->>5            lequad  8388608         invalid
->1             lelong  16777216
->>5            lequad  16777216        invalid
->1             lelong  33554432
->>5            lequad  33554432        invalid
->5             lequad  x               uncompressed size: %lld bytes
-
-
-# ------------------------------------------------------------------
-# Signature for LZMA compressed data with valid properties byte 0x5C
-# ------------------------------------------------------------------
-0              string  \x5C\x00\x00    LZMA compressed data, properties: 0x5C,
+0              string  \x5D\x00\x00    LZMA compressed data, properties: 0x5D,
 
 # These are all the valid dictionary sizes supported by LZMA utils.
 >1             lelong  !65536  
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be at least 32 bytes and at most 1GB (a value of -1 IS valid).
+# Sizes outside that range could technically be valid, but are unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
 >>>>>>>>>>1    lelong  !33554432       invalid
 >1             lelong  x               dictionary size: %d bytes,
 
-# Assume that a valid size will be less than 1GB. This could technically be valid, but is unlikely.
->5             lequad  <1              invalid
->5             lequad  >0x40000000     invalid
+# Assume that a valid size will be greater than 32 bytes and less than 1GB (a value of -1 IS valid).
+# This could technically be valid, but is unlikely.
+>5             lequad  !-1
+>>5            lequad  <32             invalid
+>>5            lequad  >0x40000000     invalid
 
 # These are not 100%. The uncompressed size could be exactly the same as the dictionary size, but it is unlikely.
 # Since most false positives are the result of repeating sequences of bytes (such as executable instructions),
index e4b5861..97a9fa7 100644 (file)
@@ -1,3 +1,4 @@
+import re
 import os.path
 import tempfile
 from common import str2int
@@ -23,9 +24,6 @@ class MagicParser:
        All magic files generated by this class will be deleted when the class destructor is called.
        '''
 
-       SHORT_SIZE = 2
-       SHORTS = ['beshort', 'leshort', 'byte']
-
        BIG_ENDIAN = 'big'
        LITTLE_ENDIAN = 'little'
 
@@ -37,18 +35,6 @@ class MagicParser:
        # If libmagic returns multiple results, they are delimited with this string.    
        RESULT_SEPERATOR = "\\012- "
 
-       # Size of the keys used in the matches set. Limited to 2
-       # as the key is the magic signature of a given magic file entry.
-       # Entries can have variable length signatures, but the lowest 
-       # common demonitor is 2, so the first two bytes of the signature
-       # is used as the key. Does this result in collisions and false
-       # positives? Yes. But false positives are filtered out by the
-       # MagicFilter class. The main purpose of MagicParser.match is to
-       # limit the number of calls to libmagic without itself incurring
-       # large computational overhead. And for that purpose, this is
-       # quite effective.
-       MATCH_INDEX_SIZE = 2
-
        def __init__(self, filter=None, smart=None):
                '''
                Class constructor.
@@ -60,7 +46,6 @@ class MagicParser:
                '''
                self.matches = set([])
                self.signatures = {}
-               self.sigset = {}
                self.filter = filter
                self.smart = smart
                self.raw_fd = None
@@ -68,10 +53,10 @@ class MagicParser:
                self.fd = tempfile.NamedTemporaryFile()
 
        def __del__(self):
-               '''
-               Class deconstructor.
-               '''
-               self.cleanup()
+               try:
+                       self.cleanup()
+               except:
+                       pass
 
        def cleanup(self):
                '''
@@ -105,38 +90,34 @@ class MagicParser:
                self.raw_fd.seek(0)
                return self.raw_fd.name
 
-       def parse(self, file_name, filter_short_signatures=True, pre_filter_signatures=True):
+       def parse(self, file_name):
                '''
                Parses magic file(s) and concatenates them into a single temporary magic file
                while simultaneously removing filtered signatures.
 
-               @file_name                      - Magic file, or list of magic files, to parse.
-               @filter_short_signatures        - Set to False to include entries with short (2 byte) magic signatures.
-               @pre_filter_signatures          - Set to False to disable smart signature keywords.
+               @file_name - Magic file, or list of magic files, to parse.
 
                Returns the name of the generated temporary magic file, which will be automatically
                deleted when the class destructor is called.
                '''
-               if type(file_name) == type([]):
+               if isinstance(file_name, type([])):
                        files = file_name
                else:
                        files = [file_name]
 
                for fname in files:
                        if os.path.exists(fname):
-                               self.parse_file(fname, filter_short_signatures, pre_filter_signatures)
+                               self.parse_file(fname)
 
                self.fd.seek(0)
                return self.fd.name
 
-      &nb