Imported Upstream version 1.2.1
[packages/binwalk.git] / binwalk / parser.py
index e4b5861..97a9fa7 100644 (file)
@@ -1,3 +1,4 @@
+import re
 import os.path
 import tempfile
 from common import str2int
@@ -23,9 +24,6 @@ class MagicParser:
        All magic files generated by this class will be deleted when the class deconstructor is called.
        '''
 
-       SHORT_SIZE = 2
-       SHORTS = ['beshort', 'leshort', 'byte']
-
        BIG_ENDIAN = 'big'
        LITTLE_ENDIAN = 'little'
 
@@ -37,18 +35,6 @@ class MagicParser:
        # If libmagic returns multiple results, they are delimited with this string.    
        RESULT_SEPERATOR = "\\012- "
 
-       # Size of the keys used in the matches set. Limited to 2
-       # as the key is the magic signature of a given magic file entry.
-       # Entries can have variable length signatures, but the lowest 
-       # common demonitor is 2, so the first two bytes of the signature
-       # is used as the key. Does this result in collisions and false
-       # positives? Yes. But false positives are filtered out by the
-       # MagicFilter class. The main purpose of MagicParser.match is to
-       # limit the number of calls to libmagic without itself incurring
-       # large computational overhead. And for that purpose, this is
-       # quite effective.
-       MATCH_INDEX_SIZE = 2
-
        def __init__(self, filter=None, smart=None):
                '''
                Class constructor.
@@ -60,7 +46,6 @@ class MagicParser:
                '''
                self.matches = set([])
                self.signatures = {}
-               self.sigset = {}
                self.filter = filter
                self.smart = smart
                self.raw_fd = None
@@ -68,10 +53,10 @@ class MagicParser:
                self.fd = tempfile.NamedTemporaryFile()
 
        def __del__(self):
-               '''
-               Class deconstructor.
-               '''
-               self.cleanup()
+               try:
+                       self.cleanup()
+               except:
+                       pass
 
        def cleanup(self):
                '''
@@ -105,38 +90,34 @@ class MagicParser:
                self.raw_fd.seek(0)
                return self.raw_fd.name
 
-       def parse(self, file_name, filter_short_signatures=True, pre_filter_signatures=True):
+       def parse(self, file_name):
                '''
                Parses magic file(s) and contatenates them into a single temporary magic file
                while simultaneously removing filtered signatures.
 
-               @file_name                      - Magic file, or list of magic files, to parse.
-               @filter_short_signatures        - Set to False to include entries with short (2 byte) magic signatures.
-               @pre_filter_signatures          - Set to False to disable smart signature keywords.
+               @file_name - Magic file, or list of magic files, to parse.
 
                Returns the name of the generated temporary magic file, which will be automatically
                deleted when the class deconstructor is called.
                '''
-               if type(file_name) == type([]):
+               if isinstance(file_name, type([])):
                        files = file_name
                else:
                        files = [file_name]
 
                for fname in files:
                        if os.path.exists(fname):
-                               self.parse_file(fname, filter_short_signatures, pre_filter_signatures)
+                               self.parse_file(fname)
 
                self.fd.seek(0)
                return self.fd.name
 
-       def parse_file(self, file_name, filter_short_signatures=True, pre_filter_signatures=True):
+       def parse_file(self, file_name):
                '''
                Parses a magic file and appends valid signatures to the temporary magic file, as allowed
                by the existing filter rules.
 
-               @file_name                      - Magic file to parse.
-               @filter_short_signatures        - Set to False to include entries with short (2 byte) magic signatures.
-               @pre_filter_signatures          - Set to False to disable smart signature keywords.
+               @file_name - Magic file to parse.
                
                Returns None.
                '''
@@ -153,61 +134,29 @@ class MagicParser:
                                entry = self._parse_line(line)
 
                                if entry is not None:
-                                       # Once an entry is identified, default to excluding the entry
-                                       include = False
-
-                                       if pre_filter_signatures:
-                                               # If the smart signature include keyword is specified for this entry,
-                                               # add an include filter for this signature description.
-                                               if self.smart.include(entry['description']):
-                                                       self.filter.include(entry['description'], exclusive=False)
-                                                       include = True
-
-                                       # If we haven't already explicitly included this entry, and we are 
-                                       # filtering out short signatures and this is a short signature, then 
-                                       # add an exclude filter for this signature description
-                                       if not include and filter_short_signatures and self._is_short(entry):
-                                               self.filter.exclude(entry['description'])
-
                                        # If this signature is marked for inclusion, include it.
                                        if self.filter.filter(entry['description']) == self.filter.FILTER_INCLUDE:
-                                               include = True
-                                       
-                                       if include:
+
+                                               include = True  
                                                self.signature_count += 1
 
                                                if not self.signatures.has_key(entry['offset']):
                                                        self.signatures[entry['offset']] = []
                                                
-                                               if entry['condition'][:self.MATCH_INDEX_SIZE] not in self.signatures[entry['offset']]:
-                                                       self.signatures[entry['offset']].append(entry['condition'][:self.MATCH_INDEX_SIZE])
+                                               if entry['condition'] not in self.signatures[entry['offset']]:
+                                                       self.signatures[entry['offset']].append(entry['condition'])
+                                       else:
+                                               include = False
 
                                # Keep writing lines of the signature to the temporary magic file until 
                                # we detect a signature that should not be included.
                                if include:
                                        self.fd.write(line)
+
+                       self.build_signature_set()                      
                except Exception, e:
                        raise Exception("Error parsing magic file '%s' on line %d: %s" % (file_name, line_count, str(e)))
 
-               # Generate a dictionary of offsets with a set of signatures     
-               for (offset, siglist) in self.signatures.iteritems():
-                       self.sigset[offset] = set(siglist)
-
-       def _is_short(self, entry):
-               '''
-               Determines if a signature entry has a short (2 byte) signature or not.
-
-               @entry - Entry dictionary, as returned by self._parse_line().
-               
-               Returns True if the signature is short, False if not short.
-               '''
-               if entry['type'] in self.SHORTS:
-                       return True
-               elif 'string' in entry['type']:
-                       if len(entry['condition'].decode('string_escape')) <= self.SHORT_SIZE:
-                               return True
-               return False
-
        def _parse_line(self, line):
                '''
                Parses a signature line into its four parts (offset, type, condition and description),
@@ -253,7 +202,7 @@ class MagicParser:
                        raise Exception("%s :: %s", (str(e), line))
 
                # If this is a string, get the length of the string
-               if 'string' in entry['type']:
+               if 'string' in entry['type'] or entry['condition'] == self.WILDCARD:
                        entry['length'] = len(entry['condition'])
                # Else, we need to jump through a few more hoops...
                else:   
@@ -267,14 +216,10 @@ class MagicParser:
                        # Try to convert the condition to an integer. This does not allow
                        # for more advanced conditions for the first line of a signature, 
                        # but needing that is rare.
-                       if entry['condition'] != self.WILDCARD:
-                               try:
-                                       intval = str2int(entry['condition'].strip('L'))
-                               except Exception, e:
-                                       raise Exception("Failed to evaluate condition for '%s' type: '%s', condition: '%s', error: %s" % (entry['description'], entry['type'], entry['condition'], str(e)))
-                       else:
-                               intval = 0
-                               entry['length'] = 1
+                       try:
+                               intval = str2int(entry['condition'].strip('L'))
+                       except Exception, e:
+                               raise Exception("Failed to evaluate condition for '%s' type: '%s', condition: '%s', error: %s" % (entry['description'], entry['type'], entry['condition'], str(e)))
 
                        # How long is the field type?
                        if entry['type'] == 'byte':
@@ -295,15 +240,41 @@ class MagicParser:
                '''
                Builds a list of signature tuples.
 
-               Returns a list of tuples in the format: [(<signature offset>, [set of 2-byte signatures])].
+               Returns a set of compiled signature regular expressions (also stored in self.signature_set).
+               '''
+               signature_set = []
+
+               for (offset, sigs) in self.signatures.iteritems():
+                       for sig in sigs:
+                               if sig == self.WILDCARD:
+                                       sig = re.compile('.')
+                               else:
+                                       sig = re.compile(re.escape(sig))
+
+                               signature_set.append(sig)
+
+               self.signature_set = set(signature_set)
+
+               return self.signature_set
+
+       def find_signature_candidates(self, data):
                '''
-               signatures = []
+               Finds candidate signatures inside of the data buffer.
+               Called internally by Binwalk.single_scan.
+
+               @data - Data to scan for candidate signatures.
+
+               Returns a sorted list of unique offsets inside of data at which candidate signatures were found.
+               '''
+               candidate_offsets = []
+
+               for regex in self.signature_set:
+                       candidate_offsets += [match.start() for match in regex.finditer(data)]
 
-               for (offset, sigset) in self.sigset.iteritems():
-                       signatures.append((offset, sigset))
+               candidate_offsets = list(set(candidate_offsets))
+               candidate_offsets.sort()
 
-               signatures.sort()
-               return signatures
+               return candidate_offsets
 
        def _to_string(self, value, size, endianess):
                '''