# Source code for appurl.url

# Copyright (c) 2017 Civic Knowledge. This file is licensed under the terms of the
# MIT, included in this distribution as LICENSE

""" """


def match_url_classes(u_str, **kwargs):
    """
    Return the classes for which the url matches an entry_point specification, sorted by priority

    Every entry point in the ``appurl.urls`` group whose name matches the URL (see
    ``Url._match_entry_point``) is loaded, and the loaded classes are returned ordered by
    ascending ``match_priority``.

    :param u_str: Url string
    :param kwargs: arguments passed to Url constructor
    :return: A list of the loaded classes, lowest ``match_priority`` value first.
    :raises ModuleNotFoundError: if the module behind a matching entry point cannot be imported.
    """

    from pkg_resources import iter_entry_points

    # A plain base-class Url is enough to evaluate the entry point patterns.
    u = Url(str(u_str), downloader=None, **kwargs)

    try:
        classes = [ep.load() for ep in iter_entry_points(group='appurl.urls')
                   if u._match_entry_point(ep.name)]
    except ModuleNotFoundError as e:
        # Include the underlying error in the message and keep it as the cause.
        raise ModuleNotFoundError("Failed to find module for url string '{}', entrypoint: {}"
                                  .format(u_str, e)) from e

    classes.sort(key=lambda cls: cls.match_priority)

    return classes


# Module-wide Downloader shared by parse_app_url() calls that use downloader='default';
# it is created lazily on the first such call.
default_downloader = None


def parse_app_url(u_str, downloader='default', **kwargs):
    """
    Parse a URL string and return a Url object, with the class based on the highest priority
    entry point that matches the Url and which of the entry point classes pass the match() test.

    :param u_str: Url string. A ``Url`` instance is returned unchanged; a false value
        (``None``, ``''``) yields ``None``.
    :param downloader: Downloader object to use for downloading objects. The string
        ``'default'`` selects the shared module-level downloader, creating it on first use.
    :param kwargs: Args passed to the Url constructor.
    :return: An instance of the first matching Url class, or ``None`` if the input is empty or
        no candidate class accepts the URL.
    :raises AppUrlError: if ``u_str`` is neither a string nor a ``Url``.
    """

    if not u_str:
        return None

    if isinstance(u_str, Url):
        return u_str

    if not isinstance(u_str, str):
        from rowgenerators.exceptions import AppUrlError
        raise AppUrlError("Input isn't a string nor Url")

    if downloader == 'default':
        global default_downloader
        if default_downloader is None:
            # Imported only when a downloader actually has to be built.
            from rowgenerators.appurl.web.download import Downloader
            default_downloader = Downloader()

        downloader = default_downloader

    classes = match_url_classes(u_str, **kwargs)

    # A base-class Url is used only to run each candidate's _match() test.
    u = Url(str(u_str), downloader=None, **kwargs)

    for cls in classes:
        if cls._match(u):
            return cls(str(u_str), downloader=downloader, **kwargs)

    # No registered class accepted the URL.
    return None


class Url(object):
    """Base class for Application URLs.

    After construction, a Url object has a set of properties and attributes for accessing
    the parts of the URL, and methods for manipulating it. The attributes and properties
    include the typical properties of a parsed URL, plus properties that are derived from the
    typical parts, and a few extra components that can be part of the fragment query.

    The typical parts are:

    - ``scheme``
    - ``scheme_extension``
    - ``netloc``
    - ``hostname``
    - ``path``
    - ``params``
    - ``query``
    - ``fragment``
    - ``username``
    - ``password``
    - ``port``

    The ``fragment`` is special; it is an array of two elements, the first of which is the
    ``target_file`` and the second is the ``target_segment``. If there are other parts of the
    source URL, they must be formatted as query components, and will be parsed into the
    ``fragment_query``.

    Special application components are:

    - ``proto``. This is set to the ``scheme_extension`` if it exists, the scheme otherwise.
    - ``resource_file``. The filename of the resource to download. It is usually the last part of
      the URL, but can be overridden in the fragment
    - ``resource_format``. The format name of the resource, normally drawn from the
      ``resource_file`` extension, but can be overridden in the fragment
    - ``target_file``. The filename of the file that will be produced by
      :py:meth:`Url.get_target`, but may be overridden.
    - ``target_format``. The format of the ``target_file``, but may be overridden.
    - ``target_segment``. A sub-component of the ``target_file``, such as the worksheet in a
      spreadsheet.
    - ``fragment_query``. Holds additional parts of the fragment.

    When the fragment holds extra parts, these can be formatted as a URL query. Recognized
    keys are:

    - ``resource_file``
    - ``resource_format``
    - ``target_file``
    - ``target_format``
    - ``encoding``. Text encoding to be used when reading the target.
    - ``headers``. For row-oriented data, the row numbers of the headers, as a comma-separated
      list of integers.
    - ``start``. For row-oriented data, the row number of the first row of data ( as opposed to
      headers. )
    - ``end``. For row-oriented data, the row number of the last row of data.

    """

    # Basic URL components
    scheme = None
    scheme_extension = None
    netloc = None
    hostname = None
    _path = None
    params = None
    query = None
    # ``fragment`` has no class-level default: it is exposed through the ``fragment``
    # property defined below, which is backed by the per-instance ``_fragment`` list.
    fragment_query = {}  # placeholder only; __init__ gives every instance its own dict
    username = None
    password = None
    port = None

    # Application components
    _proto = None
    _resource_file = None
    _resource_format = None
    _target_file = None
    _target_format = None
    _target_segment = None

    encoding = None  # target encoding
    headers = None  # line number of headers
    start = None  # start line for data
    end = None  # end line for data

    match_priority = 100
    match_proto = None
    generator_class = None  # If set, generators match with name = <{generator_class}>

    # Values of ``resource_format`` that ``is_archive`` treats as archives. Subclasses may
    # override this.
    archive_formats = ['zip']

    def __init__(self, url=None, downloader=None, **kwargs):
        """  Initialize a new Application Url
        :param url: URL string
        :param downloader: :py:class:`appurl.web.download.Downloader` object.
        :param kwargs: Additional arguments override URL properties.
        :return: An Application Url object


        When ``url`` is ``None`` the basic components are taken from the keyword arguments
        listed below (string values are stripped of surrounding whitespace). Other keywords
        are accepted and ignored.

        - scheme
        - scheme_extension
        - netloc
        - hostname
        - path
        - params
        - query
        - fragment
        - fragment_query
        - username
        - password
        - port

        When ``url`` is given it is parsed first, and only ``scheme``, ``scheme_extension``,
        ``fragment`` and ``fragment_query`` from the list above override the parsed values.

        In both cases ``proto``, ``resource_file``, ``resource_format``, ``target_format``,
        ``target_segment``, ``encoding``, ``headers``, ``start`` and ``end`` are also read from
        the keyword arguments; most of them fall back to the value in the fragment query.
        """

        assert 'is_archive' not in kwargs

        self._kwargs = kwargs

        if url is not None:
            # Imported here so that building a Url purely from keyword arguments does not
            # require the parser.
            from .util import parse_url_to_dict

            for k, v in parse_url_to_dict(url).items():
                try:
                    setattr(self, k, v)
                except AttributeError:
                    print("Can't Set: ", k, v)

        else:
            for k in "scheme scheme_extension netloc hostname path params query fragment fragment_query username " \
                     "password port".split():

                v = kwargs.get(k)

                if k == 'fragment_query' and v is None:  # Probably trying to set it to Null
                    v = {}
                elif isinstance(v, str):
                    v = v.strip()

                setattr(self, k, v)

        # An explicit fragment_query=None means "no fragment query". The dict is copied so
        # that this object never shares it with the caller or with a Url it was cloned from.
        fragment_query = kwargs.get('fragment_query')
        if fragment_query is None:
            fragment_query = self.fragment_query or {}
        self.fragment_query = dict(fragment_query)

        # Always hold the fragment as a private two-slot list, so that the target_file and
        # target_segment setters can assign into it by index.
        fragment = self._decompose_fragment(kwargs.get('fragment', self.fragment))
        fragment = list(fragment) if fragment else [None, None]
        fragment.extend([None] * (2 - len(fragment)))

        assert fragment[0] is None or isinstance(fragment[0], str), type(fragment[0])

        self._fragment = fragment

        self.scheme_extension = kwargs.get('scheme_extension', self.scheme_extension)

        self.scheme = kwargs.get('scheme', self.scheme)

        self._proto = kwargs.get('proto', self.proto)
        self._resource_file = kwargs.get('resource_file')
        self._resource_format = kwargs.get('resource_format', self.fragment_query.get('resource_format'))
        self._target_format = kwargs.get('target_format', self.fragment_query.get('target_format'))
        self._target_segment = kwargs.get('target_segment')

        self.encoding = kwargs.get('encoding', self.fragment_query.get('encoding', self.encoding))
        self.headers = kwargs.get('headers', self.fragment_query.get('headers', self.headers))
        self.start = kwargs.get('start', self.fragment_query.get('start', self.start))
        self.end = kwargs.get('end', self.fragment_query.get('end', self.end))

        # Lower-case the target format when it is a string; leave None (or anything
        # without a .lower()) untouched.
        try:
            self._target_format = self._target_format.lower()
        except AttributeError:
            pass

        self._downloader = downloader

    def resolve(self):
        """Resolve a URL to another format, such as by looking up a URL that specified a
        search, into another URL. The default implementation returns self. """
        return self

    def get_resource(self):
        """Get the contents of resource and save it to the cache, returning a file-like object

        The base class does not implement this; it always raises ``NotImplementedError``.
        """
        raise NotImplementedError(("get_resource not implemented in {} for '{}'. "
                                   "You may need to install a python mpdule for this type of url")
                                  .format(self.__class__.__name__, str(self)))

    def get_target(self):
        """Get the contents of the target, and save it to the cache, returning a file-like object

        The base class does not implement this; it always raises ``NotImplementedError``.
        """
        raise NotImplementedError(("get_target not implemented in {} for '{}'. "
                                   "You may need to install a python mpdule for this type of url"
                                   )
                                  .format(self.__class__.__name__, str(self)))

    @property
    def downloader(self):
        """Return the Downloader() for this URL"""
        return self._downloader

    def list(self):
        """Return URLS for files contained in an container. This implementation just returns
        ``[self]``, but sub classes may, for instance, list all of the sub-components of a directory,
        or all of the worksheets in an Excel file. """
        return [self]

    @property
    def is_archive(self):
        """Return true if this URL is for an archive, that is, if ``resource_format`` is one
        of ``archive_formats``. Currently only ZIP is recognized"""
        return self.resource_format in self.archive_formats

    # Note: this is a plain method, not a property.
    def archive_file(self):
        """Return the name of the archive file, if there is one."""
        return self.target_file if self.is_archive and self.resource_file != self.target_file else None

    @property
    def path(self):
        """The path component of the URL."""
        return self._path

    @path.setter
    def path(self, v):
        self._path = v

    @property
    def fspath(self):
        """The path in a form suitable for use in a filesystem"""
        return self.path

    def _split_join_arg(self, s):
        """Return ``(url, path, netloc)`` for the argument of :meth:`join` / :meth:`join_dir`.

        An object that already has ``path`` and ``netloc`` attributes is used as-is; anything
        else is parsed with :func:`parse_app_url`, using this Url's downloader.
        """
        try:
            return s, s.path, s.netloc
        except AttributeError:
            u = parse_app_url(s, downloader=self.downloader)
            return u, u.path, u.netloc

    def join(self, s):
        """ Join a component to the end of the path. The argument
        ``s`` may be a :class:`appurl.Url` or a string. If ``s`` includes a ``netloc`` property,
        it is assumed to be an absolute url, and it is returned after parsing as a Url. Otherwise,
        the path component of ``s`` is extracted and joined to the path component of this url.

        :param s: A Url object, or a string.
        :return: A copy of this url.
        """

        from copy import copy
        import pathlib

        u, path, netloc = self._split_join_arg(s)

        # If there is a netloc, it's an absolute URL
        if netloc:
            return u

        url = copy(self)

        # Using pathlib.PurePosixPath ensures using '/' on windows. os.path.join will use '\'
        url.path = str(pathlib.PurePosixPath(self.path).joinpath(path))

        return url

    def join_dir(self, s):
        """ Join a component to the parent directory of the path, using join(dirname())

        :param s: A Url object, or a string. If it has a ``netloc`` it is treated as an absolute
            url and returned (after parsing, for a string).
        :return: a copy of this url.
        """

        from os.path import dirname
        from copy import copy
        import pathlib

        u, path, netloc = self._split_join_arg(s)

        # If there is a netloc, it's an absolute URL
        if netloc:
            return u

        url = copy(self)
        # Using pathlib.PurePosixPath ensures using '/' on windows. os.path.join will use '\'
        url.path = str(pathlib.PurePosixPath(dirname(self.path)).joinpath(path))

        return url

    def join_target(self, tf):
        """Return a new URL, possibly of a new class, with a new target_file

        Not implemented in the base class; always raises ``NotImplementedError``.
        """

        raise NotImplementedError("Not implemented in '{}' ".format(type(self)))

    @property
    def inner(self):
        """Return the URL without the scheme extension. Re-parses the URL, so it should return
        the correct class for the inner URL. A URL with no scheme extension is returned as-is.

        NOTE(review): this docstring used to say the fragment is removed too, but the code only
        clears ``scheme_extension`` -- confirm which is intended.
        """
        if not self.scheme_extension:
            return self

        return parse_app_url(str(self.clone(scheme_extension=None)), downloader=self.downloader)

    def dirname(self):
        """Return a clone of this url whose path is the dirname of the path"""
        from os.path import dirname

        u = self.clone()
        u.path = dirname(self.path)
        return u

    def clear_fragment(self):
        """
        Return a copy of the URL with no fragment components

        :return: A cloned URl object, with the fragment and fragment queries cleared.
        """

        c = self.clone()
        c.fragment = [None, None]
        c.fragment_query = {}
        c.encoding = None
        c.start = None
        c.end = None
        c.headers = None
        return c

    def as_type(self, cls):
        """
        Return the URL transformed to a different class. Copies the downloader and
        build the new url using :py:meth:`Url.dict`

        :param cls: Class of Url to construct
        :return: A new Url object
        """

        return cls(downloader=self.downloader, **self.dict)

    @property
    def dict(self):
        """
        Returns a dictionary of the object components.

        The fragment query is refreshed (see ``_update_parts``) before the values are read.
        The fragment is exported under the key ``_fragment``.

        :return: a dict.
        """
        self._update_parts()
        keys = "scheme scheme_extension netloc hostname path params query _fragment fragment_query username password " \
               "port proto  resource_format  target_format " \
               "encoding target_segment".split()

        return {k: getattr(self, k) for k in keys}

    def interpolate(self, context=None):
        """
        Use the Downloader.context to interpolate format strings in the URL. Re-parses the URL,
        returning a new URL

        :param context: Extra context to interpolate with
        :return: The Url parsed from the interpolated string.
        :raises AppUrlError: if the URL references a key missing from the combined context.
        """

        from copy import copy
        from rowgenerators.exceptions import AppUrlError

        cxt = copy(self.downloader.context)

        cxt.update(context or {})

        try:
            return parse_app_url(str(self).format(**cxt), downloader=self.downloader)
        except KeyError as e:
            raise AppUrlError("Failed to interpolate '{}'; context is {}. Missing key: {} "
                              .format(str(self), self.downloader.context, e)) from e

    def clone(self, **kwargs):
        """
        Return a clone of this Url, possibly with some arguments replaced.

        :param kwargs: Keyword arguments are arguments to set in the copy, using :py:func:`setattr`
        :return: A cloned Url object.
        :raises AttributeError: if one of the keyword arguments cannot be set on the clone.
        """

        d = self.dict.copy()
        c = type(self)(None, downloader=self._downloader, **d)
        c._kwargs = self._kwargs
        # The dict carries the fragment as '_fragment', which the constructor ignores, so
        # it is copied over explicitly (the setter stores a new list).
        c.fragment = self.fragment

        c._update_parts()

        for k, v in kwargs.items():
            try:
                setattr(c, k, v)
            except AttributeError:
                raise AttributeError("Can't set attribute '{}' on '{}' ".format(k, c))

        return c

    @property
    def generator(self):
        """
        Return the generator for this URL, if the rowgenerator package is installed.

        :return: A row generator object.
        """

        from rowgenerators.core import get_generator

        r = self.get_resource()
        t = r.get_target()

        return get_generator(t.get_target(), source_url=self)

    #
    # Property accessors
    #

    @property
    def fragment(self):
        """The fragment, as a ``[target_file, target_segment]`` list."""
        return self._fragment

    @fragment.setter
    def fragment(self, v):
        """Set the fragment in place"""
        assert isinstance(v, (list, tuple, type(None), str)), v

        if isinstance(v, str):
            # One string is the target_file
            self._fragment = [v, None]
        elif isinstance(v, (list, tuple)):
            self._fragment = list(v)
        else:
            self._fragment = [None, None]

    def set_fragment(self, f):
        """Return a clone with the fragment set"""
        u = self.clone()
        u.fragment = f
        return u

    @property
    def proto(self):
        """The protocol: the explicit ``proto`` if one was set, else the scheme extension, else
        the scheme (with ``https`` reported as ``http`` and an empty scheme as ``file``)."""
        return self._proto or \
               self.scheme_extension or \
               {'https': 'http', '': 'file'}.get(self.scheme) or \
               self.scheme

    @property
    def resource_url(self):
        """The URL of the resource: this URL rendered with the scheme extension, fragment and
        fragment query switched off, and with ``file`` as the scheme if none is set."""
        from .util import unparse_url_dict

        return unparse_url_dict(self.dict,
                                scheme=self.scheme if self.scheme else 'file',
                                scheme_extension=False,
                                fragment_query=False,
                                fragment=False)

    @property
    def resource_file(self):
        """The basename of the path, or ``None`` if there is no path."""

        from os.path import basename

        if self.path:
            return basename(self.path)

        return None

    @property
    def resource_format(self):
        """The explicitly set resource format if there is one, otherwise the extension of the
        ``resource_file``; ``None`` if there is no resource file."""

        if self._resource_format:
            return self._resource_format

        if not self.resource_file:
            return None

        # Imported only when the format has to be derived from the file name.
        from .util import file_ext

        return file_ext(self.resource_file)

    @property
    def target_file(self):
        """The target file: the first fragment element if it is set, otherwise the
        ``resource_file``."""

        if self._target_file:
            return self._target_file

        try:
            if self.fragment[0]:
                return self.fragment[0]
        except IndexError:
            pass

        return self.resource_file

    @target_file.setter
    def target_file(self, v):
        self.fragment[0] = v

    def set_target_file(self, v):
        """Return a clone with a target_file set"""
        u = self.clone()
        u.fragment[0] = v
        return u

    @property
    def target_segment(self):
        """The second fragment element, or ``None`` if there is no fragment."""
        if self.fragment:
            return self.fragment[1]

        return None

    @target_segment.setter
    def target_segment(self, v):
        self.fragment[1] = v

    def set_target_segment(self, v):
        """Return a clone with a target_segment set"""
        u = self.clone()
        u.fragment[1] = v
        return u

    @property
    def target_format(self):
        """The format of the target: the explicit target format, else the extension of the
        ``target_file``, else the ``resource_format``. Candidates longer than 8 characters are
        rejected and reported as ``None``."""

        target_format = self._target_format or None

        if not target_format and self.target_file:
            # Imported only when the format has to be derived from the file name.
            from .util import file_ext

            target_format = file_ext(self.target_file)

        if not target_format:
            target_format = self.resource_format

        # handle URLS that end with package names, like:
        # 'example.com-example_data_package-2017-us-1'
        if target_format and len(target_format) > 8:
            target_format = None

        return target_format

    @target_format.setter
    def target_format(self, target_format):
        self._target_format = target_format

    #
    # Matching methods
    #

    def _match_entry_point(self, name):
        """Return true if this URL matches the entrypoint pattern

        ``name`` may be the pattern string or an object with a ``name`` attribute holding it.
        Several patterns can be combined with ``&``; all of them must match.

        Entrypoint patterns:

            '*' Matches every URL
            'scheme:' Match the URL scheme
            'proto+' Matches the protocol / scheme_extension
            '.ext' Match the resource extension
            '#.ext' Match the target extension
        """

        # Maybe it's an entrypoint entry, not the name. Unwrap it before any string
        # operation is applied to it.
        name = getattr(name, 'name', name)

        if '&' in name:
            return all(self._match_entry_point(n) for n in name.split('&'))

        if name == '*':
            return True

        if name.endswith(":"):
            return name[:-1] == self.scheme

        if name.endswith('+'):
            return name[:-1] == self.proto

        if name.startswith('.'):
            return name[1:] == self.resource_format

        if name.startswith('#.'):
            return name[2:] == self.target_format

        return False

    @classmethod
    def _match(cls, url, **kwargs):
        """Return True if this handler can handle the input URL

        A class with a ``match_proto`` accepts URLs whose ``proto`` equals it; a class without
        one accepts every URL.
        """
        if cls.match_proto:
            return url.proto == cls.match_proto

        return True

    #
    # Other support methods
    #

    def _update_parts(self):
        """Update the fragment_query from the ``encoding``, ``headers``, ``start`` and ``end``
        attributes. Set the attribute for the query value to False to delete it from
        the fragment query. A numeric zero is a real value and is stored; other false values
        (``None``, ``''``) leave the fragment query untouched."""

        for k in "encoding headers start end".split():
            value = getattr(self, k)

            if value is False:
                self.fragment_query.pop(k, None)
            elif value or value == 0:
                self.fragment_query[k] = value

    def __deepcopy__(self, memo):
        """Build a new object of the same class from ``dict`` and the original keyword args.

        NOTE(review): ``dict`` exports the fragment as ``_fragment``, which the constructor does
        not read, so the copy only keeps a fragment that was passed as a keyword argument --
        confirm this is intended.
        """
        d = self.dict.copy()
        d.update(self._kwargs)
        return type(self)(None, downloader=self._downloader, **d)

    def __copy__(self):
        """Build a new object of the same class from ``dict`` and the original keyword args.

        NOTE(review): ``dict`` exports the fragment as ``_fragment``, which the constructor does
        not read, so the copy only keeps a fragment that was passed as a keyword argument --
        confirm this is intended.
        """
        d = self.dict.copy()
        d.update(self._kwargs)
        return type(self)(None, downloader=self._downloader, **d)

    def _decompose_fragment(self, frag):
        """Parse the fragment component

        :param frag: A list or tuple (returned unchanged), a false value, or a fragment string
            of the form ``file`` or ``file;segment``; the string is URL-unquoted first.
        :return: A ``(file, segment)`` pair, or ``frag`` itself when it is a list or tuple.
        """
        from urllib.parse import unquote_plus

        if isinstance(frag, (list, tuple)):
            assert not frag or frag[0] is None or isinstance(frag[0], str), (frag[0], type(frag[0]))
            return frag

        if not frag:
            return None, None

        # Parts beyond the second are ignored.
        frag_parts = unquote_plus(frag).split(';')

        file = frag_parts[0]
        segment = frag_parts[1] if len(frag_parts) > 1 else None

        assert file is None or isinstance(file, str), (file, type(file))

        return file, segment

    def __repr__(self):
        return "<{} {}>".format(self.__class__.__name__, str(self))

    def __str__(self):

        from .util import unparse_url_dict

        self._update_parts()
        return unparse_url_dict(self.dict)
# Source code for appurl.url
#
# Row Generators
#
# Navigation
#
# Related Topics