# Source code for appurl.url

# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT, included in this distribution as LICENSE

"""Application URL classes: parse URL strings into Url objects and manipulate their parts."""

from os.path import basename
from urllib.parse import unquote

from .util import file_ext, parse_url_to_dict, unparse_url_dict

def match_url_classes(u_str, **kwargs):
    """
    Return the classes for which the url matches an entry_point specification, sorted by priority

    Every entry point in the ``appurl.urls`` group whose name matches the URL (see
    ``Url._match_entry_point``) is loaded, and the loaded classes are ordered by ascending
    ``match_priority``.

    :param u_str: Url string
    :param kwargs: arguments passed to Url constructor
    :return: A list of the loaded classes, lowest ``match_priority`` first.
    :raises ModuleNotFoundError: If a module is missing while loading an entry point. The
        message names the url string and includes the original error text.
    """

    from pkg_resources import iter_entry_points

    # A plain Url is enough for matching; no downloader is needed at this stage.
    u = Url(str(u_str), downloader=None, **kwargs)

    try:
        classes = [ep.load()
                   for ep in iter_entry_points(group='appurl.urls')
                   if u._match_entry_point(ep.name)]

    except ModuleNotFoundError as e:
        # The original message had a single placeholder, so the underlying error was dropped.
        raise ModuleNotFoundError("Failed to find module for url string '{}', entrypoint: {}"
                                  .format(u_str, e)) from e

    return sorted(classes, key=lambda cls: cls.match_priority)

# Shared Downloader instance. It starts as None and is created lazily by parse_app_url()
# the first time that function is called with downloader='default'.
default_downloader = None

def parse_app_url(u_str, downloader='default', **kwargs):
    """
    Parse a URL string and return a Url object, with the class based on the highest priority
    entry point that matches the Url and which of the entry point classes pass the match() test.

    :param u_str: Url string. A Url object is returned unchanged; an empty or ``None`` value
        yields ``None``.
    :param downloader: Downloader object to use for downloading objects. The string
        ``'default'`` selects the module-level shared downloader, creating it on first use.
    :param kwargs: Args passed to the Url constructor.
    :return: An instance of the first matching Url class, or ``None`` if the input is empty
        or no candidate class accepts the URL.
    :raises AppUrlError: If ``u_str`` is neither a string nor a Url.
    """

    if not u_str:
        return None

    if isinstance(u_str, Url):
        return u_str

    if not isinstance(u_str, str):
        # Imported only where needed, so the early-return paths do not require the package.
        from rowgenerators.exceptions import AppUrlError
        raise AppUrlError("Input isn't a string nor Url")

    if downloader == 'default':
        global default_downloader
        if default_downloader is None:
            from rowgenerators.appurl.web.download import Downloader
            default_downloader = Downloader.get_instance()

        downloader = default_downloader

    classes = match_url_classes(u_str, **kwargs)

    # Plain Url used only to run each candidate class's _match() test.
    u = Url(str(u_str), downloader=None, **kwargs)

    for cls in classes:
        if cls._match(u):
            return cls(str(u_str), downloader=downloader, **kwargs)

    # No candidate class accepted the URL.
    return None


class UrlPartsProp(object):
    """Property descriptor for reading and writing to the ``_parts`` dict
    in UrlParts.

    The owning object must have a ``_parts`` dict; the descriptor reads and writes the
    entry whose key is ``name``.
    """

    def __init__(self, name):
        """
        :param name: Key in the owner's ``_parts`` dict that this descriptor manages.
        """
        self.name = name

    def __get__(self, obj, objtype=None):
        """Return the stored value, or ``None`` when the key is absent.

        When accessed on the class rather than an instance, return the descriptor itself,
        so that ``hasattr`` and introspection on the class do not fail.
        """
        if obj is None:
            return self

        return obj._parts.get(self.name)

    def __set__(self, obj, value):
        """Store ``value``; assigning ``None`` to an existing key removes the key instead."""
        if value is None and self.name in obj._parts:
            del obj._parts[self.name]
        else:
            obj._parts[self.name] = value

    def __delete__(self, obj):
        """Remove the key from ``_parts``. Raises ``KeyError`` if it is not present."""
        del obj._parts[self.name]

class UrlParts(object):
    """Container class for handling property accessors

    The parsed components of the URL are held in the ``_parts`` dict and are exposed through
    :class:`UrlPartsProp` descriptors and a few computed properties.
    """

    _url_parts = ['proto', 'scheme_extension', 'scheme',
                  'netloc', 'hostname',
                  'username', 'password', 'port',
                  'path', 'query', 'fragment', 'fragment_query']

    _app_parts = ['resource_file', 'resource_format',
                  'target_file', 'target_format', 'target_segment']

    # Keys that are moved between the fragment query and the top level of ``_parts``.
    _fragment_query_parts = ['start', 'end', 'headers', 'encoding',
                             'resource_file', 'resource_format', 'target_format']

    # Keys that make up the two-element ``fragment`` list.
    _fragment_segments_parts = ['target_file', 'target_segment']

    _all_parts = set(_url_parts + _app_parts + _fragment_query_parts + _fragment_segments_parts)

    # Add extra fragment parts here.
    _extra_fragement_props = []

    def __init__(self, url, **kwargs):
        """
        :param url: URL string to parse. If empty or ``None``, the parts start out empty.
        :param kwargs: Values written into the parts after parsing, overriding parsed values.
        """

        self._url = url
        self._kwargs = kwargs

        if self._url:
            self._parts = parse_url_to_dict(self._url)
        else:
            self._parts = {}

        self._convert_fragment()
        self._convert_fragment_query()

        # Applied last, so keyword arguments win over values parsed from the URL.
        self._parts.update(kwargs)

    def _convert_fragment(self):
        """Replace a list/tuple ``fragment`` part with ``target_file`` / ``target_segment``.

        One element sets ``target_file``; two elements set both. The ``fragment`` key is
        removed in every case where it was a list or tuple.
        """

        fragment = self._parts.get('fragment')

        if not isinstance(fragment, (list, tuple)):
            return

        if len(fragment) == 1:
            self._parts['target_file'] = fragment[0]
        elif len(fragment) == 2:
            self._parts['target_file'], self._parts['target_segment'] = fragment

        del self._parts['fragment']

    def _convert_fragment_query(self):
        """Move recognized keys out of the ``fragment_query`` dict to the top level of the parts."""

        fragment_query = self._parts.get('fragment_query')

        if not isinstance(fragment_query, dict):
            return

        # Iterate over a snapshot of the keys because entries are removed while looping.
        for k in list(fragment_query):
            if k in self._fragment_query_parts:
                self._parts[k] = fragment_query.pop(k)

    scheme = UrlPartsProp('scheme')
    scheme_extension = UrlPartsProp('scheme_extension')
    netloc = UrlPartsProp('netloc')
    hostname = UrlPartsProp('hostname')
    username = UrlPartsProp('username')
    password = UrlPartsProp('password')
    port = UrlPartsProp('port')
    path = UrlPartsProp('path')
    query = UrlPartsProp('query')
    target_segment = UrlPartsProp('target_segment')
    start = UrlPartsProp('start')
    end = UrlPartsProp('end')
    headers = UrlPartsProp('headers')
    encoding = UrlPartsProp('encoding')

    fragment_query = UrlPartsProp('fragment_query')

    @property
    def proto(self):
        """The protocol: an explicit ``proto`` part, else the ``scheme_extension``, else the
        scheme, with ``https`` mapped to ``http`` and an empty scheme mapped to ``file``.

        Missing parts are treated as unset rather than raising ``KeyError``.
        """
        scheme = self._parts.get('scheme')

        return self._parts.get('proto') or \
               self._parts.get('scheme_extension') or \
               {'https': 'http', '': 'file'}.get(scheme) or \
               scheme

    @proto.setter
    def proto(self, v):
        self._parts['proto'] = v

    @property
    def target_format(self):
        """The format of the target: an explicit ``target_format`` part, else the extension of
        ``target_file``, else the ``resource_format``. Values longer than 8 characters are
        discarded and ``None`` is returned.
        """

        target_format = self._parts.get('target_format')

        if not target_format and self.target_file:
            target_format = file_ext(self.target_file)

        if not target_format:
            target_format = self.resource_format

        # handle URLS that end with package names, like:
        # 'example.com-example_data_package-2017-us-1'
        if target_format and len(target_format) > 8:
            target_format = None

        return target_format

    @target_format.setter
    def target_format(self, v):
        self._parts['target_format'] = v

    def clear_fragment(self):
        """
        Return a copy of the URL with the fragment segments cleared.

        Only ``target_file`` and ``target_segment`` are reset (to ``None``); other
        fragment-query values are left as they are. Relies on a ``clone()`` method, which
        this class does not define itself.

        :return: A cloned URL object, with ``target_file`` and ``target_segment`` cleared.
        """

        c = self.clone()
        c._parts['target_file'] = None
        c._parts['target_segment'] = None

        return c

    #
    # Property accessors
    #

    def set_fragment(self, f):
        """Return a clone with the fragment set"""
        raise NotImplementedError()

    @property
    def resource_file(self):
        """The base name of the path, or ``None`` when there is no path."""
        if self.path:
            return basename(self.path)
        else:
            return None

    @property
    def resource_format(self):
        """An explicit ``resource_format`` part, else the extension of ``resource_file``."""
        return self._parts.get('resource_format') or file_ext(self.resource_file)

    @resource_format.setter
    def resource_format(self, v):
        self._parts['resource_format'] = v

    @property
    def target_file(self):
        """An explicit ``target_file`` part, else the ``resource_file``."""

        return self._parts.get('target_file') or self.resource_file

    @target_file.setter
    def target_file(self, v):
        self._parts['target_file'] = v

    def set_target_file(self, v):
        """Return a clone with a target_file set"""
        u = self.clone()
        u.target_file = v
        return u

    def set_target_segment(self, v):
        """Return a clone with a target_segment set"""
        u = self.clone()
        u.target_segment = v
        return u

    @property
    def dict(self):
        """
        Returns a dictionary of the object components.

        The result is a new dict. Fragment segment values are gathered into ``fragment`` (a
        two element list) and the recognized fragment-query values are gathered into
        ``fragment_query``. The ``fragment_query`` dict is copied first, so building the
        result never modifies the parts held by this object.

        :return: a dict.
        """

        d = dict(self._parts.items())

        d['scheme_extension'] = self._parts.get('proto') or d.get('scheme_extension')

        # Drop fragment related entries that carry no value.
        for k in self._fragment_query_parts + self._fragment_segments_parts:
            if k in d and not d[k]:
                del d[k]

        d['fragment'] = [
            self._parts.get('target_file'),
            self._parts.get('target_segment')
        ]

        fragment_query = d.get('fragment_query')

        if isinstance(fragment_query, dict):
            # ``d`` is only a shallow copy; without this copy the loop below would write
            # into the dict stored in self._parts.
            fragment_query = d['fragment_query'] = dict(fragment_query)

        for k in self._fragment_query_parts:
            if k not in d:
                continue

            if not isinstance(fragment_query, dict) and not fragment_query:
                # No fragment query yet; create one instead of failing with KeyError.
                fragment_query = d['fragment_query'] = {}

            fragment_query[k] = d.pop(k)

        return d

    @property
    def frag_dict(self):
        """A dict of the fragment segment and fragment query parts; absent parts are ``None``."""
        return {k: self._parts.get(k)
                for k in self._fragment_segments_parts + self._fragment_query_parts}

    def __str__(self):

        return unparse_url_dict(self.dict)




class Url(UrlParts):
    """Base class for Application URLs.

    After construction, a Url object has a set of properties and attributes for accessing
    the parts of the URL, and methods for manipulating it. The attributes and properties
    include the typical properties of a parsed URL, plus properties that are derived from the
    typical parts, and a few extra components that can be part of the fragment query.

    The typical parts are:

    - ``scheme``
    - ``scheme_extension``
    - ``netloc``
    - ``hostname``
    - ``path``
    - ``params``
    - ``query``
    - ``fragment``
    - ``username``
    - ``password``
    - ``port``

    The ``fragment`` is special; it is an array of two elements, the first of which is the ``target_file``
    and the second is the ``target_segment``. If there are other parts of the source URL, they must be
    formatted as query components, and will be parsed into the ``fragment_query``.

    Special application components are:

    - ``proto``. This is set to the ``scheme_extension`` if it exists, the scheme otherwise.
    - ``resource_file``. The filename of the resource to download. It is usually the last part of the URL, but can be overridden in the fragment
    - ``resource_format``. The format name of the resource, normally drawn from the ``resource_file`` extension, but can be overridden in the fragment
    - ``target_file``. The filename of the file that will be produced by :py:meth:`Url.get_target`, but may be overridden.
    - ``target_format``. The format of the ``target_file``, but may be overridden.
    - ``target_segment``. A sub-component of the ``target_file``, such as the worksheet in a spreadsheet.
    - ``fragment_query``. Holds additional parts of the fragment.

    When the fragment holds extra parts, these can be formatted as a URL query. Recognized keys are:

    - ``resource_file``
    - ``resource_format``
    - ``target_file``
    - ``target_format``
    - ``encoding``. Text encoding to be used when reading the target.
    - ``headers``. For row-oriented data, the row numbers of the headers, as a comma-separated list of integers.
    - ``start``. For row-oriented data, the row number of the first row of data ( as opposed to headers. )
    - ``end``. For row-oriented data, the row number of the last row of data.

    """

    match_priority = 100
    match_proto = None
    generator_class = None  # If set, generators match with name = <{generator_class}>

    # Resource formats treated as archives by ``is_archive``. The property reads this
    # attribute, so a default is needed here; subclasses may override it.
    # NOTE(review): 'zip' is taken from the is_archive docstring -- confirm.
    archive_formats = ['zip']

    def __init__(self, url=None, downloader=None, **kwargs):
        """  Initialize a new Application Url
        :param url: URL string
        :param downloader: :py:class:`appurl.web.download.Downloader` object.
        :param kwargs: Additional arguments override URL properties.
        :return: An Application Url object

        Keyword arguments will override properties set by parsing the URL string.

        """

        self._kwargs = kwargs
        self._downloader = downloader

        super().__init__(url, **kwargs)

        # ``is_archive`` is a computed, read-only property, so it is not accepted as a part.
        assert 'is_archive' not in self._kwargs #?

    def resolve(self):
        """Resolve a URL to another format, such as by looking up a URL that specified a
        search, into another URL. The default implementation returns self. """
        return self

    def get_resource(self):
        """Get the contents of resource and save it to the cache, returning a file-like object

        The base class does not implement this and always raises ``NotImplementedError``.
        """
        raise NotImplementedError(("get_resource not implemented in {} for '{}'. "
                                   "You may need to install a python mpdule for this type of url")
                                  .format(self.__class__.__name__, str(self)))

    def get_target(self):
        """Get the contents of the target, and save it to the cache, returning a file-like object

        The base class does not implement this and always raises ``NotImplementedError``.
        """
        raise NotImplementedError(("get_target not implemented in {} for '{}'"
                                   "You may need to install a python module for this type of url"
                                   )
                                  .format(self.__class__.__name__, str(self)))

    @property
    def downloader(self):
        """Return the Downloader() for this URL"""
        return self._downloader

    def list(self):
        """Return URLS for files contained in an container. This implementation just returns
        ``[self]``, but sub classes may, for instance, list all of the sub-components of a directory,
        or all of the worksheets in an Excel file. """
        return [self]

    @property
    def is_archive(self):
        """Return true if this URL is for an archive, that is, if the ``resource_format`` is
        one of ``archive_formats``."""
        return self.resource_format in self.archive_formats

    # NOTE: this is a plain method, not a property; callers must call it.
    def archive_file(self):
        """Return the name of the archive file, if there is one."""
        return self.target_file if self.is_archive and self.resource_file != self.target_file else None

    @property
    def fspath(self):
        """The path in a form suitable for use in a filesystem"""
        from pathlib import PurePath
        return PurePath(unquote(self.path))

    @property
    def path_is_absolute(self):
        """True if the path starts with ``/``; False when there is no path."""
        return bool(self.path) and self.path.startswith('/')

    def _split_join_arg(self, s):
        """Return ``(url, path, netloc)`` for a ``join`` argument that is a Url or a string."""
        try:
            return s, s.path, s.netloc
        except AttributeError:
            # Not a Url-like object; parse it as a URL string.
            u = parse_app_url(s, downloader=self.downloader)
            return u, u.path, u.netloc

    def join(self, s):
        """ Join a component to the end of the path, using :class:`pathlib.PurePosixPath`. The argument
        ``s`` may be a :class:`appurl.Url` or a string. If ``s`` includes a ``netloc`` property,
        it is assumed to be an absolute url, and it is returned after parsing as a Url. Otherwise,
        the path component of ``s`` is extracted and joined to the path component of this url.

        :param s: A Url object, or a string.
        :return: A copy of this url.
        """

        from copy import copy
        import pathlib

        u, path, netloc = self._split_join_arg(s)

        # If there is a netloc, it's an absolute URL
        if netloc:
            return u

        url = copy(self)

        # Using pathlib.PurePosixPath ensures using '/' on windows. os.path.join will use '\'
        url.path = str(pathlib.PurePosixPath(self.path).joinpath(path))

        return url

    def join_dir(self, s):
        """ Join a component to the parent directory of the path, using join(dirname())

        :param s: A Url object, or a string. If it has a ``netloc`` it is treated as an absolute
            url and returned as a Url.
        :return: a copy of this url.
        """

        from os.path import dirname
        from copy import copy
        import pathlib

        u, path, netloc = self._split_join_arg(s)

        # If there is a netloc, it's an absolute URL
        if netloc:
            return u

        url = copy(self)
        # Using pathlib.PurePosixPath ensures using '/' on windows. os.path.join will use '\'
        url.path = str(pathlib.PurePosixPath(dirname(self.path)).joinpath(path))

        return url

    def join_target(self, tf):
        """Return a new URL, possibly of a new class, with a new target_file"""

        raise NotImplementedError("Not implemented in '{}' ".format(type(self)))

    @property
    def inner(self):
        """Return the URL without the scheme extension and fragment. Re-parses the URL, so it should return
        the correct class for the inner URL. """
        if not self.scheme_extension:
            return self

        c = self.clone(scheme_extension=None, proto=None)

        return parse_app_url(str(c), downloader=self.downloader)

    @property
    def resource_url(self):
        """The URL as a string, unparsed with the scheme extension, fragment and fragment
        query switched off, and with ``file`` used when there is no scheme."""

        return unparse_url_dict(self.dict,
                                scheme=self.scheme if self.scheme else 'file',
                                scheme_extension=False,
                                fragment_query=False,
                                fragment=False)

    def dirname(self):
        """Return a clone whose path is the dirname of this path"""
        from os.path import dirname

        u = self.clone()
        u.path = dirname(self.path)
        return u

    def as_type(self, cls):
        """
        Return the URL transformed to a different class. Copies the downloader and
        build the new url using :py:meth:`Url.dict`

        :param cls: Class of Url to construct
        :return: A new Url object
        """

        return cls(downloader=self.downloader, **self.dict)

    def interpolate(self, context=None):
        """
        Use the Downloader.context to interpolate format strings in the URL. Re-parses the URL,
        returning a new URL

        :param context: Extra context to interpolate with
        :return: The re-parsed Url.
        :raises AppUrlError: If the URL references a key missing from the context.
        """

        from copy import copy

        cxt = copy(self.downloader.context)

        cxt.update(context or {})

        from rowgenerators.exceptions import AppUrlError

        try:
            return parse_app_url(str(self).format(**cxt), downloader=self.downloader)
        except KeyError as e:
            raise AppUrlError("Failed to interpolate '{}'; context is {}. Missing key: {} "
                              .format(str(self), self.downloader.context, e)) from e

    def clone(self, **kwargs):
        """
        Return a clone of this Url, possibly with some arguments replaced.

        :param kwargs: Keyword arguments are arguments to set in the copy, using :py:func:`setattr`
        :return: A cloned Url object.
        :raises AttributeError: If one of the keyword arguments can not be set on the clone.
        """
        from copy import deepcopy

        c = deepcopy(self)

        for k, v in kwargs.items():
            try:
                setattr(c, k, v)

            except AttributeError as e:
                raise AttributeError("Can't set attribute '{}' on '{}' ".format(k, c)) from e

        return c

    @property
    def generator(self):
        """
        Return the generator for this URL, if the rowgenerator package is installed.

        :return: A row generator object.
        """

        from rowgenerators.core import get_generator

        r = self.get_resource()
        t = r.get_target()

        # NOTE(review): get_target() is called again on the target -- confirm this is intended.
        return get_generator(t.get_target(), source_url=self)

    #
    # Matching methods
    #

    def _match_entry_point(self, name):
        """Return true if this URL matches the entrypoint pattern

        Entrypoint patterns:

            '*' Matches every URL
            '/regex/' Search the URL string with the regular expression
            'scheme:' Match the URL scheme
            'proto+' Matches the protocol / scheme_extension
            '.ext' Match the resource extension
            '#.ext' Match the target extension

        Several patterns may be combined with '&', in which case all of them must match.

        :param name: The pattern string, or an object with a ``name`` attribute holding it.
        :return: True or False
        """

        import re

        # Unwrap first: the string tests below do not work on an entrypoint object.
        try:
            name = name.name  # Maybe it's an entrypoint entry, not the name
        except AttributeError:
            pass

        if '&' in name:
            return all(self._match_entry_point(n) for n in name.split('&'))

        if name == '*':
            return True
        elif name.startswith("/") and name.endswith("/"):
            return bool(re.search(name[1:-1], str(self)))
        elif name.endswith(":"):
            return name[:-1] == self.scheme
        elif name.endswith('+'):
            return name[:-1] == self.proto
        elif name.startswith('.'):
            return name[1:] == self.resource_format
        elif name.startswith('#.'):
            return name[2:] == self.target_format
        else:
            return False

    @classmethod
    def _match(cls, url, **kwargs):
        """Return True if this handler can handle the input URL

        A class with ``match_proto`` set accepts only URLs with that ``proto``; a class
        without it accepts every URL.
        """
        if cls.match_proto:
            return url.proto == cls.match_proto

        return True

    #
    # Other support methods
    #

    def __deepcopy__(self, memo):
        """Build a new object of the same class from the parts, sharing the downloader.

        Dict and list values in the parts are deep-copied so that the copy does not share
        mutable containers (such as ``fragment_query``) with this object.
        """
        from copy import deepcopy

        parts = {k: deepcopy(v, memo) if isinstance(v, (dict, list)) else v
                 for k, v in self._parts.items()}

        return type(self)(None, downloader=self._downloader, **parts)

    def __copy__(self):
        """Build a new object of the same class from the parts, sharing the downloader."""
        return type(self)(None, downloader=self._downloader, **self._parts)

    def _decompose_fragment(self, frag):
        """Parse the fragment component

        :param frag: A fragment string, with the file and segment separated by ';', or an
            already decomposed list or tuple, which is returned unchanged.
        :return: A ``(file, segment)`` pair; either element may be ``None``.
        """
        from urllib.parse import unquote_plus

        if isinstance(frag, (list, tuple)):
            assert frag[0] is None or isinstance(frag[0], str), (frag[0], type(frag[0]))
            return frag

        if not frag:
            return None, None

        # str.split always returns at least one element, so the file part is always present.
        frag_parts = unquote_plus(frag).split(';')

        file = frag_parts[0]
        segment = frag_parts[1] if len(frag_parts) >= 2 else None

        return file, segment

    def __repr__(self):
        return "<{} {}>".format(self.__class__.__name__, str(self))
# Source code for appurl.url
#
# Row Generators
#
# Navigation
#
# Related Topics