Source code for rowgenerators.appurl.archive.zip

# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT, included in this distribution as LICENSE

""" """

from rowgenerators.appurl.file.file import FileUrl
from rowgenerators.exceptions import AppUrlError


class ZipUrlError(AppUrlError):
    """Error raised for problems specific to Zip URLs and their archives."""


class ZipUrl(FileUrl):
    """Zip URLs represent a zip file, as a local resource."""

    match_priority = FileUrl.match_priority - 10

    def __init__(self, url=None, downloader=None, **kwargs):
        """Initialize through ``FileUrl`` and force ``resource_format`` to ``'zip'``.

        :param url: passed through to ``FileUrl.__init__``.
        :param downloader: passed through to ``FileUrl.__init__``.
        :param kwargs: passed through to ``FileUrl.__init__``.
        """
        super().__init__(url, downloader=downloader, **kwargs)

        # Only assign when the format differs, as the original code did.
        if self.resource_format != 'zip':
            self.resource_format = 'zip'

    @property
    def target_file(self):
        """Return the target file, which is usually stored in the first slot in the
        ``fragment``, but may have been overridden with a ``fragment_query``.

        An explicitly stored ``_parts['target_file']`` wins. Otherwise, when the resource
        file is named like ``name.csv.zip``, ``name.xls.zip`` or ``name.xlsx.zip``, the
        target is the resource file name with the trailing ``.zip`` removed.

        :return: the target file name, or ``None`` so that ``get_file_from_zip`` can fall
            back to the first file in the archive.
        """
        explicit = self._parts['target_file']
        if explicit:
            return explicit

        resource_file = self.resource_file
        if resource_file and resource_file.endswith(('.csv.zip', '.xls.zip', '.xlsx.zip')):
            # Strip only the trailing suffix; str.replace('.zip', '') would also remove
            # any earlier '.zip' that happens to appear inside the name.
            return resource_file[:-len('.zip')]

        # Want to return None, so get_file_from_zip can assume to use the first
        # file in the archive.
        return None

    # Just a copy of the one from Url; looks like it must be reset because
    # the target_file prop was replaced
    @target_file.setter
    def target_file(self, v):
        self._parts['target_file'] = v

    def join_target(self, tf):
        """Join the target ``tf`` by setting the target file on a clone of this URL.

        :param tf: either an object with a ``path`` attribute, whose string form is used,
            or a value that is stored as the target file directly.
        :return: a clone of this url with the new target file.
        """
        u = self.clone()

        try:
            target = str(tf.path)
        except AttributeError:
            # No ``path`` attribute: use the value as given.
            target = tf

        u.target_file = target
        return u

    def get_resource(self):
        """Return this URL itself as the resource."""
        return self

    @property
    def zip_dir(self):
        """Directory that files will be extracted to.

        When the absolute path of the archive starts with the cache's system path, the
        directory is the archive path with ``_d`` appended. Otherwise the archive's
        absolute path, with leading slashes stripped, is mapped into the cache and
        ``_d`` is appended.
        """
        from os.path import abspath

        cache_dir = self.downloader.cache.getsyspath('/')
        target_path = abspath(self.fspath)

        if target_path.startswith(cache_dir):
            # Case when file is already in cache
            return str(self.fspath) + '_d'

        # File is not in cache; it may exist elsewhere.
        return self.downloader.cache.getsyspath(target_path.lstrip('/')) + '_d'

    def get_target(self):
        """Extract the target file from the archive, store it in the cache, and return
        a file Url to the cached file.

        Side effect: ``self.target_file`` is set to the archive member that was resolved
        by ``get_file_from_zip``.

        :return: the URL produced by ``parse_app_url`` for the extracted file.
        :raises ZipUrlError: if the file is not a zip archive, or if the resolved member
            name would be written outside of ``zip_dir``.
        """
        from os.path import abspath, dirname, join
        from zipfile import BadZipFile, ZipFile

        from rowgenerators.appurl.url import parse_app_url
        from rowgenerators.appurl.util import copy_file_or_flo, ensure_dir

        zip_dir = self.zip_dir
        assert zip_dir

        try:
            zf = ZipFile(str(self.fspath))
        except BadZipFile as e:
            raise ZipUrlError(f"Not a zip file: {str(self.fspath)} for url {str(self)}") from e

        # The context manager closes the archive even when extraction fails.
        with zf:
            self.target_file = ZipUrl.get_file_from_zip(self)

            target_path = join(zip_dir, self.target_file)

            # Member names come from the archive, so refuse names (absolute, or using
            # '..') that would place the extracted file outside the extraction directory.
            extract_root = join(abspath(zip_dir), '')
            if not abspath(target_path).startswith(extract_root):
                raise ZipUrlError(
                    f"Refusing to extract '{self.target_file}' outside of '{zip_dir}' "
                    f"for url {str(self)}")

            ensure_dir(dirname(target_path))

            # Open the member first so a failure does not leave an empty output file.
            with zf.open(self.target_file) as flo, open(target_path, 'wb') as f:
                copy_file_or_flo(flo, f)

        # These fragment entries are dropped before the remaining ones are passed
        # on as keyword arguments for the new URL.
        fq = self.frag_dict
        for key in ('resource_format', 'resource_file', 'target_file'):
            if key in fq:
                del fq[key]

        tu = parse_app_url(target_path,
                           scheme_extension=self.scheme_extension,
                           downloader=self.downloader,
                           **fq)

        if self.target_format != tu.target_format:
            try:
                tu.target_format = self.target_format
            except AttributeError:
                pass  # Some URLs don't allow resetting target type.

        return tu

    def list(self):
        """List the files in the referenced Zip file.

        When a target file is known, the listing is delegated to the extracted target
        and each entry is mapped back with ``set_target_segment``. Otherwise one URL per
        real file in the archive is produced with ``set_target_file``.
        """
        from zipfile import ZipFile

        if self.target_file:
            return [self.set_target_segment(tl.target_segment)
                    for tl in self.get_target().list()]

        # Close the archive once the (lazy) member listing has been consumed.
        with ZipFile(str(self.fspath)) as zf:
            return [self.set_target_file(rf) for rf in ZipUrl.real_files_in_zf(zf)]

    @staticmethod
    def get_file_from_zip(url):
        """Given a file name that may be a regular expression, return the full name for
        the file from a zip archive.

        Resolution order: first archive member matching ``url.target_file`` (a regular
        expression, with a leading ``*`` glob tolerated); then the member at integer
        index ``url.target_segment``; then, if neither is set, the first member.

        :param url: a URL object providing ``fspath``, ``target_file`` and
            ``target_segment``.
        :return: the name of the selected archive member.
        :raises AppUrlError: if the file is not a valid zip archive.
        :raises ZipUrlError: if no member can be selected, including for an empty archive.
        """
        import re
        from zipfile import BadZipFile, ZipFile

        from rowgenerators.exceptions import AppUrlError

        try:
            zf = ZipFile(str(url.fspath))
        except BadZipFile as e:
            raise AppUrlError(f"Bad zip file: '{str(url.fspath)}' ") from e

        with zf:
            # Old way, but maybe gets links? : list(zf.namelist())
            nl = list(ZipUrl.real_files_in_zf(zf))

            if not nl:
                # Sometimes real_files_in_zf yields nothing at all; fall back to the
                # plain name list.
                nl = list(zf.namelist())

        tf = url.target_file
        ts = url.target_segment

        # The target_file may be a string, or a regular expression. It may also be
        # None (no target set), so it must be checked before calling str methods.
        if tf and tf.startswith('*'):
            # Common user error using a glob instead of a regex
            tf = tf.replace('*', '.*')

        if tf:
            names = [e for e in nl
                     if re.search(tf, e) and not e.startswith(('__', '.'))]
            if names:
                return names[0]

        # The segment, if it exists, can only be an integer, and should probably be
        # '0' to indicate the first file. This clause is probably a bad idea, since
        # any other integer is probably meaningless.
        if ts:
            try:
                return nl[int(ts)]
            except (IndexError, ValueError):
                pass

        # Just return the first file in the archive, when there is one.
        if not tf and not ts and nl:
            return nl[0]

        raise ZipUrlError("Could not find file in Zip {} for target='{}' nor segment='{}'"
                          .format(url.fspath, url.target_file, url.target_segment))

    @staticmethod
    def real_files_in_zf(zf):
        """Yield the internal paths of real files in a zip file, based on the
        ``external_attr`` values.

        :param zf: an open archive object providing ``infolist()``.
        :return: a generator of member file names.
        """
        from os.path import basename

        for info in zf.infolist():

            # Get rid of __MACOS and .DS_whatever
            if basename(info.filename).startswith(('__', '.')):
                continue

            # external_attr is poorly understood, so this is a set of heuristics:
            #   attr >> 31 & 1 works when the archive has external attrs set, and a
            #                  dir hierarchy
            #   attr == 0      works in cases where there are no external attrs set
            #   attr == 32     is true for some single-file archives.
            #   attr == 128    has been true at least once ...
            attr = info.external_attr
            if attr >> 31 & 1 or attr in (0, 32, 128):
                yield info.filename

    @classmethod
    def _match(cls, url, **kwargs):
        """Return a truthy value when ``url`` has the ``'zip'`` resource format or the
        ``force_archive`` keyword is set."""
        return url.resource_format == 'zip' or kwargs.get('force_archive')