Imported Upstream version 1.2.1
[packages/binwalk.git] / binwalk / extractor.py
index fdcd511..f0573c3 100644 (file)
@@ -1,10 +1,11 @@
 import os
+import re
 import sys
 import shlex
 import tempfile
 import subprocess
 from config import *
-from common import file_size
+from common import file_size, unique_file_name
 
 class Extractor:
        '''
@@ -20,7 +21,7 @@ class Extractor:
                # Create extraction rules for scan results containing the string 'gzip compressed data' and 'filesystem'.
                # The former will be saved to disk with a file extension of 'gz' and the command 'gunzip <file name on disk>' will be executed (note the %e placeholder).
                # The latter will be saved to disk with a file extension of 'fs' and no command will be executed.
-               # These rules will take precedence over subsequent rules with the same match string.
+               # A rule is ignored if an earlier rule with the same match string has already been added.
                bw.extractor.add_rule(['gzip compressed data:gz:gunzip %e', 'filesystem:fs'])
 
                # Load the extraction rules from the default extract.conf file(s).
@@ -40,6 +41,9 @@ class Extractor:
        # Place holder for the extracted file name in the command 
        FILE_NAME_PLACEHOLDER = '%e'
 
+       # Max size of data to read/write at one time when extracting data
+       MAX_READ_SIZE = 10 * 1024 * 1024
+
        def __init__(self, verbose=False):
                '''
                Class constructor.
@@ -54,46 +58,98 @@ class Extractor:
                self.verbose = verbose
                self.extract_rules = {}
                self.remove_after_execute = False
+               self.extract_path = os.getcwd()
 
-       def add_rule(self, rule):
+       def add_rule(self, txtrule=None, regex=None, extension=None, cmd=None):
                '''
                Adds a set of rules to the extraction rule list.
 
-               @rule - Rule string, or list of rule strings, in the format <case insensitive matching string>:<file extension>[:<command to run>]
+               @txtrule   - Rule string, or list of rule strings, in the format <regular expression>:<file extension>[:<command to run>]
+               @regex     - If rule string is not specified, this is the regular expression string to use.
+               @extension - If rule string is not specified, this is the file extension to use.
+               @cmd       - If rule string is not specified, this is the command to run.
+                            Alternatively a callable object may be specified, which will be passed one argument: the name of the extracted file on disk.
 
                Returns None.
                '''
+               rules = []
+               match = False
                r = {
                        'extension'     : '',
-                       'cmd'           : ''
+                       'cmd'           : '',
+                       'regex'         : None
                }
 
-               if type(rule) != type([]):
-                       rules = [rule]
-               else:
-                       rules = rule
+               if not txtrule and regex and extension:
+                       txtrule = '%s:%s' % (regex, extension)
+                       if cmd:
+                               txtrule += ':%s' % cmd
 
+               if not isinstance(txtrule, type([])):
+                       rules = [txtrule]
+               else:
+                       rules = txtrule
+               
                for rule in rules:
                        r['cmd'] = ''
                        r['extension'] = ''
 
                        try:
                                values = self._parse_rule(rule)
-                               match = values[0].lower()
+                               match = values[0]
+                               r['regex'] = re.compile(values[0])
                                r['extension'] = values[1]
                                r['cmd'] = values[2]
                        except:
                                pass
 
+                       if not match and regex and extension:
+                               match = regex
+                               r['regex'] = re.compile(regex)
+                               r['extension'] = extension
+                               r['cmd'] = cmd
+
                        # Verify that the match string and file extension were retrieved.
                        # Only add the rule if it is a new one (first come, first served).
                        if match and r['extension'] and not self.extract_rules.has_key(match):
                                self.extract_rules[match] = {}
                                self.extract_rules[match]['cmd'] = r['cmd']
                                self.extract_rules[match]['extension'] = r['extension']
+                               self.extract_rules[match]['regex'] = r['regex']
                                # Once any rule is added, set self.enabled to True
                                self.enabled = True
 
+       def remove_rule(self, text):
+               '''
+               Remove all rules whose regular expression matches the beginning of the specified text.
+
+               @text - The text to match against.
+
+               Returns the number of rules removed.
+               '''
+               i = 0
+
+               for key in self.extract_rules.keys():
+                       if self.extract_rules[key]['regex'].match(text):
+                               del self.extract_rules[key]
+                               i += 1
+               return i
+
+       def clear_rules(self):
+               '''
+               Deletes all extraction rules.
+
+               Returns None.
+               '''
+               self.extract_rules = {}
+               self.enabled = False
+
+       def get_rules(self):
+               '''
+               Returns the internal dictionary (not a copy) of all extraction rules, keyed by regular expression string.
+               '''
+               return self.extract_rules
+
        def enable_delayed_extract(self, tf=None):
                '''
                Enables / disables the delayed extraction feature.
@@ -142,6 +198,16 @@ class Extractor:
                                if self.verbose:
                                        raise Exception("Extractor.load_defaults failed to load file '%s': %s" % (extract_file, str(e)))
 
+       def output_directory(self, path):
+               '''
+               Set the output directory for extracted files.
+
+               @path - The extraction path.
+
+               Returns None.
+               '''
+               self.extract_path = path
+
        def cleanup_extracted_files(self, tf=None):
                '''
                Set the action to take after a file is extracted.
@@ -170,45 +236,57 @@ class Extractor:
 
                Returns the name of the extracted file (blank string if nothing was extracted).
                '''
+               fname = ''
                cleanup_extracted_fname = True
+               original_dir = os.getcwd()
 
-               rule = self._match(description)
-               if rule is not None:
-                       fname = self._dd(file_name, offset, size, rule['extension'], output_file_name=name)
-                       if rule['cmd']:
-
-                               # Many extraction utilities will extract the file to a new file, just without
-                               # the file extension (i.e., myfile.7z => myfile). If the presumed resulting
-                               # file name already exists before executing the extract command, do not attempt 
-                               # to clean it up even if its resulting file size is 0.
-                               if self.remove_after_execute:
-                                       extracted_fname = os.path.splitext(fname)[0]
-                                       if os.path.exists(extracted_fname):
-                                               cleanup_extracted_fname = False
-
-                               # Execute the specified command against the extracted file
-                               self._execute(rule['cmd'], fname)
+               if not os.path.exists(self.extract_path):
+                       os.mkdir(self.extract_path)
 
-                               # Only clean up files if remove_after_execute was specified                             
-                               if self.remove_after_execute:
-
-                                       # Remove the original file that we extracted
-                                       try:
-                                               os.unlink(fname)
-                                       except:
-                                               pass
+               file_path = os.path.realpath(file_name)
+               
+               if os.path.isfile(file_path):
+                       os.chdir(self.extract_path)
+
+                       rule = self._match(description)
+                       if rule is not None:
+                               fname = self._dd(file_path, offset, size, rule['extension'], output_file_name=name)
+                               if rule['cmd']:
+
+                                       # Many extraction utilities will extract the file to a new file, just without
+                                       # the file extension (i.e., myfile.7z -> myfile). If the presumed resulting
+                                       # file name already exists before executing the extract command, do not attempt 
+                                       # to clean it up even if its resulting file size is 0.
+                                       if self.remove_after_execute:
+                                               extracted_fname = os.path.splitext(fname)[0]
+                                               if os.path.exists(extracted_fname):
+                                                       cleanup_extracted_fname = False
+       
+                                       # Execute the specified command against the extracted file
+                                       self._execute(rule['cmd'], fname)
 
-                                       # If the command worked, assume it removed the file extension from the extracted file
+                                       # Only clean up files if remove_after_execute was specified                             
+                                       if self.remove_after_execute:
 
-                                       # If the extracted file name file exists and is empty, remove it
-                                       if cleanup_extracted_fname and os.path.exists(extracted_fname) and file_size(extracted_fname) == 0:
+                                               # Remove the original file that we extracted
                                                try:
-                                                       os.unlink(extracted_fname)
+                                                       os.unlink(fname)
                                                except:
                                                        pass
-               else:
-                       fname = ''
 
+                                               # If the command worked, assume it removed the file extension from the extracted file
+
+                                               # If the extracted file name file exists and is empty, remove it
+                                               if cleanup_extracted_fname and os.path.exists(extracted_fname) and file_size(extracted_fname) == 0:
+                                                       try:
+                                                               os.unlink(extracted_fname)
+                                                       except:
+                                                               pass
+       
+                               fname = os.path.join(self.extract_path, fname)
+
+                       os.chdir(original_dir)
+       
                return fname
 
        def delayed_extract(self, results, file_name, size):
@@ -280,7 +358,7 @@ class Extractor:
                description = description.lower()
 
                for (m, rule) in self.extract_rules.iteritems():
-                       if m in description:
+                       if rule['regex'].search(description):
                                return rule
                return None
 
@@ -306,23 +384,23 @@ class Extractor:
 
                Returns the extracted file name.
                '''
-               # Default extracted file name is <hex offset>.<extension>
-               altname = "%X.%s" % (offset, extension)
-               
+               total_size = 0
+
                if not output_file_name or output_file_name is None:
-                       fname = altname
+                       # Default extracted file name is <hex offset>.<extension>
+                       bname = "%X" % offset
                else:
-                       fname = "%s.%s" % (output_file_name, extension)
-       
-               # Sanitize output file name of invalid/dangerous characters (like file paths)   
-               fname = os.path.basename(fname)
+                       # Strip the output file name of invalid/dangerous characters (like file paths)  
+                       bname = os.path.basename(output_file_name)
+               
+               fname = unique_file_name(bname, extension)
 
                try:
                        # Open the target file and seek to the offset
                        fdin = open(file_name, "rb")
                        fdin.seek(offset)
                        
-                       # Open the extracted file
+                       # Open the output file
                        try:
                                fdout = open(fname, "wb")
                        except:
@@ -330,8 +408,14 @@ class Extractor:
                                fname = altname
                                fdout = open(fname, "wb")
 
-                       # Read size bytes from target file and write it to the extracted file
-                       fdout.write(fdin.read(size))
+                       # Read data from target file in chunks and write it to the extracted file
+                       while total_size < size:
+                               block_size = size - total_size
+                               if block_size > self.MAX_READ_SIZE:
+                                       block_size = self.MAX_READ_SIZE
+                       
+                               fdout.write(fdin.read(block_size))
+                               total_size += block_size
 
                        # Cleanup
                        fdout.close()
@@ -352,16 +436,19 @@ class Extractor:
                '''
                tmp = None
 
-               # If not in verbose mode, create a temporary file to redirect stdout and stderr to
-               if not self.verbose:
-                       tmp = tempfile.TemporaryFile()
-
                try:
-                       # Replace all instances of FILE_NAME_PLACEHOLDER in the command with fname
-                       cmd = cmd.replace(self.FILE_NAME_PLACEHOLDER, fname)
-
-                       # Execute.
-                       subprocess.call(shlex.split(cmd), stdout=tmp, stderr=tmp)
+                       if callable(cmd):
+                               cmd(fname)
+                       else:
+                               # If not in verbose mode, create a temporary file to redirect stdout and stderr to
+                               if not self.verbose:
+                                       tmp = tempfile.TemporaryFile()
+
+                               # Replace all instances of FILE_NAME_PLACEHOLDER in the command with fname
+                               cmd = cmd.replace(self.FILE_NAME_PLACEHOLDER, fname)
+       
+                               # Execute.
+                               subprocess.call(shlex.split(cmd), stdout=tmp, stderr=tmp)
                except Exception, e:
                        sys.stderr.write("WARNING: Extractor.execute failed to run '%s': %s\n" % (cmd, str(e)))