Module hxl.converters

Data-conversion classes

This module holds classes for converting to HXL from other formats, or from HXL to other formats. Currently, the only class is Tagger (for adding tags to non-HXL tabular data on the fly), but we may add more converters, especially for formats like GeoJSON.

Author

David Megginson

License

Public Domain

Expand source code
"""Data-conversion classes

This module holds classes for converting to HXL from other formats, or
from HXL to other formats. Currently, the only class is ``Tagger`` (for
adding tags to non-HXL tabular data on the fly), but we may add more
converters, especially for formats like GeoJSON.

Author:
    David Megginson

License:
    Public Domain

"""

import hxl
import logging, re


logger = logging.getLogger(__name__)


class Tagger(hxl.input.AbstractInput):
    """Add HXL hashtags to a non-HXL datasource on the fly.

    The tagger scans the first rows of the input for a header row whose
    cells match the tagging specs, and emits a row of hashtag specs
    immediately after that header row. Every other row is passed
    through unchanged.

    Example:
    ```
    input = hxl.input.make_input(url_or_filename)
    specs = [('Cluster', '#sector'), ('Organi', '#org'), ('province', '#adm1+es')]
    dataset = hxl.converters.Tagger(input, specs)
    ```

    The more-common way to invoke the tagger is through the
    ``hxl.input.tagger()`` function:

    ```
    dataset = hxl.input.tagger(url_or_filename, specs)
    ```

    """

    # Maximum number of leading rows to scan when looking for a header row.
    _HEADER_SEARCH_ROWS = 25

    def __init__(self, input, specs=[], default_tag=None, match_all=False):
        """Construct a new Tagger object.

        The input spec is a list of tuples, where the first item is a
        substring to match (case-/space-/punctuation-insensitive), and
        the second item is the HXL tag spec to use.

        Example:
        ```
        spec = [
            ['Cluster', '#sector'],
            ['Organisation', '#org'],
            ['Province', '#adm1+es']
        ]
        ```

        Args:
            input (hxl.input.AbstractInput): an input source that can yield rows of values (see ``hxl.input.make_input``).
            specs (list or dict): the input specs, as described above; a dict mapping header text to tag spec is also accepted (default: [])
            default_tag (str): default tagspec to use for any column without a match
            match_all (bool): if True, require that the full header string match; otherwise, match substrings (default: False)
        """
        # NOTE: the mutable default for ``specs`` is safe here because the
        # argument is only read; a fresh list is always built below.
        if isinstance(specs, dict):
            # convert to list of tuples if needed
            specs = [(key, specs[key]) for key in specs]
        # Normalise the header text once, so that matching in
        # _check_header() is a plain string comparison.
        self.specs = [(hxl.datatypes.normalise_string(spec[0]), spec[1]) for spec in specs]
        self.default_tag = default_tag
        self.match_all = match_all
        self.input = iter(input)
        # Rows already read from the input (plus the generated tag row)
        # that have not yet been handed to the caller.
        self._cache = []
        self._found_tags = False

    def __next__(self):
        """Return the next line of input (including the new tags).

        On the first call, the leading rows are scanned for a header
        row; those rows and the generated hashtag row are returned (in
        order) before reading any further from the input.

        Raises:
            hxl.HXLException: if no header row matching the specs was found
            StopIteration: when the underlying input is exhausted
        """
        if not self._found_tags:
            # Search the leading rows for a match.
            if not self._add_tags():
                # if no match, throw an exception
                raise hxl.HXLException("Tagging failed")
            self._found_tags = True
        if self._cache:
            # read from the cache, first
            return self._cache.pop(0)
        return next(self.input)

    def _add_tags(self):
        """Look for headers in the first 25 rows of data.

        Every row read is appended to the cache so that it can still be
        returned to the caller; when a header row is found, the matching
        hashtag row is cached directly after it.

        @return: True if headers were found matching the tagging specs; False otherwise.
        """
        for _ in range(self._HEADER_SEARCH_ROWS):
            try:
                raw_row = next(self.input)
            except StopIteration:
                # Input ran out before a header row turned up: report the
                # failure rather than letting StopIteration leak out of
                # __next__() and silently end the iteration.
                break
            if raw_row is None:
                break
            self._cache.append(raw_row)
            tag_row = self._try_tag_row(raw_row)
            if tag_row:
                self._cache.append(tag_row)
                return True
        return False

    def _try_tag_row(self, raw_row):
        """See if we can match a potential header row with the spec headers.

        Each cell gets the tag of the first spec that matches it, or an
        empty string if none does. The row counts as a header row only
        if at least one cell matched and the number of matched cells is
        at least half the number of specs.

        @param raw_row: the row to check
        @return: the row of hashtag specs if successful, or None otherwise.
        """
        tags = []
        tag_count = 0
        for value in raw_row:
            value = hxl.datatypes.normalise_string(value)
            for spec in self.specs:
                if self._check_header(spec[0], value):
                    tags.append(spec[1])
                    tag_count += 1
                    break
            else:
                # run only if nothing found
                tags.append('')
        # Integer form of "tag_count / len(specs) >= 0.5"; tag_count > 0
        # implies that there is at least one spec.
        if tag_count > 0 and 2 * tag_count >= len(self.specs):
            if self.default_tag:
                # fill in the unmatched columns
                tags = [tag or self.default_tag for tag in tags]
            return tags
        return None

    def _check_header(self, spec, header):
        """Check if an individual header matches a spec for tagging.
        Assumes that both the spec and the header have already been
        case- and whitespace-normalised. If self.match_all is True,
        then the spec must match the header completely; otherwise, it
        needs to match only a substring.
        @param spec: the spec to match
        @param header: the header to test
        @return True if there's a match; False otherwise
        """
        if self.match_all:
            return spec == header
        return spec in header

    # this class is its own iterator
    def __iter__(self):
        """Return the tagger itself as the row iterator."""
        return self

    _SPEC_PATTERN = r'^(.+)(#{token}([+]{token})*)$'.format(token=hxl.datatypes.TOKEN_PATTERN)
    """Regular-expression pattern for matching a tagging specification as a string"""

    @staticmethod
    def parse_spec(s):
        """Parse a tagger spec from a single string.

        The string is in the format "HEADER TEXT#hashtag+attributes"

        Example:
        ```
        spec = hxl.converters.Tagger.parse_spec("Organisation name#org+name")
        ```

        Used only by the command-line tools.

        Args:
            s (str): the string representing a tagging specification

        Returns:
            tuple: a (header text, tag spec) pair, where the tag spec is the ``display_tag`` of the parsed hashtag and attributes

        Raises:
            hxl.filters.HXLFilterException: if the string does not match the spec format

        """
        result = re.match(Tagger._SPEC_PATTERN, s)
        if result:
            # Column.parse() is called with use_exception=True, so it
            # presumably raises on a malformed hashtag -- confirm in hxl.model.
            return (result.group(1), hxl.model.Column.parse(result.group(2), use_exception=True).display_tag)
        # Qualified name: HXLFilterException is not imported into this module.
        raise hxl.filters.HXLFilterException("Bad tagging spec: " + s)

    @staticmethod
    def _load(input, spec):
        """Create a tagger from a dict spec.

        Example:
        ```
        {
          "match_all": false,
          "default_tag": "#affected+label",
          "specs": [
            ["district", "#adm1+name"],
            ["p-code", "#adm1+code+v_pcode"],
            ["organi", "#org+name"]
          ]
        }
        ```

        Args:
            input: the input source to pass to the Tagger constructor
            spec (dict): the tagger settings; missing keys fall back to the constructor defaults

        Returns:
            Tagger: the newly-constructed tagger

        """
        return Tagger(
            input=input,
            specs=spec.get('specs', []),
            default_tag=spec.get('default_tag', None),
            match_all=spec.get('match_all', False)
        )

Classes

class Tagger (input, specs=[], default_tag=None, match_all=False)

Add HXL hashtags to a non-HXL datasource on the fly.

Example:

input = hxl.input.make_input(url_or_filename)
specs = [('Cluster', '#sector'), ('Organi', '#org'), ('province', '#adm1+es')]
dataset = hxl.converters.Tagger(input, specs)

The more-common way to invoke the tagger is through the tagger() function:

dataset = hxl.input.tagger(url_or_filename, specs)

Construct a new Tagger object.

The input spec is a list of tuples, where the first item is a substring to match (case-/space-/punctuation-insensitive), and the second item is the HXL tag spec to use.

Example:

spec = [
    ['Cluster', '#sector'],
    ['Organisation', '#org'],
    ['Province', '#adm1+es']
]

Args

input : AbstractInput
an input source that can yield rows of values (see make_input()).
specs : list or dict
the input specs, as described above (default: [])
match_all : bool
if True, require that the full header string match; otherwise, match substrings (default: False)
default_tag : str
default tagspec to use for any column without a match
Expand source code
class Tagger(hxl.input.AbstractInput):
    """Add HXL hashtags to a non-HXL datasource on the fly.

    The tagger scans the first rows of the input for a header row whose
    cells match the tagging specs, and emits a row of hashtag specs
    immediately after that header row. Every other row is passed
    through unchanged.

    Example:
    ```
    input = hxl.input.make_input(url_or_filename)
    specs = [('Cluster', '#sector'), ('Organi', '#org'), ('province', '#adm1+es')]
    dataset = hxl.converters.Tagger(input, specs)
    ```

    The more-common way to invoke the tagger is through the
    ``hxl.input.tagger()`` function:

    ```
    dataset = hxl.input.tagger(url_or_filename, specs)
    ```

    """

    # Maximum number of leading rows to scan when looking for a header row.
    _HEADER_SEARCH_ROWS = 25

    def __init__(self, input, specs=[], default_tag=None, match_all=False):
        """Construct a new Tagger object.

        The input spec is a list of tuples, where the first item is a
        substring to match (case-/space-/punctuation-insensitive), and
        the second item is the HXL tag spec to use.

        Example:
        ```
        spec = [
            ['Cluster', '#sector'],
            ['Organisation', '#org'],
            ['Province', '#adm1+es']
        ]
        ```

        Args:
            input (hxl.input.AbstractInput): an input source that can yield rows of values (see ``hxl.input.make_input``).
            specs (list or dict): the input specs, as described above; a dict mapping header text to tag spec is also accepted (default: [])
            default_tag (str): default tagspec to use for any column without a match
            match_all (bool): if True, require that the full header string match; otherwise, match substrings (default: False)
        """
        # NOTE: the mutable default for ``specs`` is safe here because the
        # argument is only read; a fresh list is always built below.
        if isinstance(specs, dict):
            # convert to list of tuples if needed
            specs = [(key, specs[key]) for key in specs]
        # Normalise the header text once, so that matching in
        # _check_header() is a plain string comparison.
        self.specs = [(hxl.datatypes.normalise_string(spec[0]), spec[1]) for spec in specs]
        self.default_tag = default_tag
        self.match_all = match_all
        self.input = iter(input)
        # Rows already read from the input (plus the generated tag row)
        # that have not yet been handed to the caller.
        self._cache = []
        self._found_tags = False

    def __next__(self):
        """Return the next line of input (including the new tags).

        On the first call, the leading rows are scanned for a header
        row; those rows and the generated hashtag row are returned (in
        order) before reading any further from the input.

        Raises:
            hxl.HXLException: if no header row matching the specs was found
            StopIteration: when the underlying input is exhausted
        """
        if not self._found_tags:
            # Search the leading rows for a match.
            if not self._add_tags():
                # if no match, throw an exception
                raise hxl.HXLException("Tagging failed")
            self._found_tags = True
        if self._cache:
            # read from the cache, first
            return self._cache.pop(0)
        return next(self.input)

    def _add_tags(self):
        """Look for headers in the first 25 rows of data.

        Every row read is appended to the cache so that it can still be
        returned to the caller; when a header row is found, the matching
        hashtag row is cached directly after it.

        @return: True if headers were found matching the tagging specs; False otherwise.
        """
        for _ in range(self._HEADER_SEARCH_ROWS):
            try:
                raw_row = next(self.input)
            except StopIteration:
                # Input ran out before a header row turned up: report the
                # failure rather than letting StopIteration leak out of
                # __next__() and silently end the iteration.
                break
            if raw_row is None:
                break
            self._cache.append(raw_row)
            tag_row = self._try_tag_row(raw_row)
            if tag_row:
                self._cache.append(tag_row)
                return True
        return False

    def _try_tag_row(self, raw_row):
        """See if we can match a potential header row with the spec headers.

        Each cell gets the tag of the first spec that matches it, or an
        empty string if none does. The row counts as a header row only
        if at least one cell matched and the number of matched cells is
        at least half the number of specs.

        @param raw_row: the row to check
        @return: the row of hashtag specs if successful, or None otherwise.
        """
        tags = []
        tag_count = 0
        for value in raw_row:
            value = hxl.datatypes.normalise_string(value)
            for spec in self.specs:
                if self._check_header(spec[0], value):
                    tags.append(spec[1])
                    tag_count += 1
                    break
            else:
                # run only if nothing found
                tags.append('')
        # Integer form of "tag_count / len(specs) >= 0.5"; tag_count > 0
        # implies that there is at least one spec.
        if tag_count > 0 and 2 * tag_count >= len(self.specs):
            if self.default_tag:
                # fill in the unmatched columns
                tags = [tag or self.default_tag for tag in tags]
            return tags
        return None

    def _check_header(self, spec, header):
        """Check if an individual header matches a spec for tagging.
        Assumes that both the spec and the header have already been
        case- and whitespace-normalised. If self.match_all is True,
        then the spec must match the header completely; otherwise, it
        needs to match only a substring.
        @param spec: the spec to match
        @param header: the header to test
        @return True if there's a match; False otherwise
        """
        if self.match_all:
            return spec == header
        return spec in header

    # this class is its own iterator
    def __iter__(self):
        """Return the tagger itself as the row iterator."""
        return self

    _SPEC_PATTERN = r'^(.+)(#{token}([+]{token})*)$'.format(token=hxl.datatypes.TOKEN_PATTERN)
    """Regular-expression pattern for matching a tagging specification as a string"""

    @staticmethod
    def parse_spec(s):
        """Parse a tagger spec from a single string.

        The string is in the format "HEADER TEXT#hashtag+attributes"

        Example:
        ```
        spec = hxl.converters.Tagger.parse_spec("Organisation name#org+name")
        ```

        Used only by the command-line tools.

        Args:
            s (str): the string representing a tagging specification

        Returns:
            tuple: a (header text, tag spec) pair, where the tag spec is the ``display_tag`` of the parsed hashtag and attributes

        Raises:
            hxl.filters.HXLFilterException: if the string does not match the spec format

        """
        result = re.match(Tagger._SPEC_PATTERN, s)
        if result:
            # Column.parse() is called with use_exception=True, so it
            # presumably raises on a malformed hashtag -- confirm in hxl.model.
            return (result.group(1), hxl.model.Column.parse(result.group(2), use_exception=True).display_tag)
        # Qualified name: HXLFilterException is not imported into this module.
        raise hxl.filters.HXLFilterException("Bad tagging spec: " + s)

    @staticmethod
    def _load(input, spec):
        """Create a tagger from a dict spec.

        Example:
        ```
        {
          "match_all": false,
          "default_tag": "#affected+label",
          "specs": [
            ["district", "#adm1+name"],
            ["p-code", "#adm1+code+v_pcode"],
            ["organi", "#org+name"]
          ]
        }
        ```

        Args:
            input: the input source to pass to the Tagger constructor
            spec (dict): the tagger settings; missing keys fall back to the constructor defaults

        Returns:
            Tagger: the newly-constructed tagger

        """
        return Tagger(
            input=input,
            specs=spec.get('specs', []),
            default_tag=spec.get('default_tag', None),
            match_all=spec.get('match_all', False)
        )

Ancestors

Static methods

def parse_spec(s)

Parse a JSON-like tagger spec

The string is in the format "HEADER TEXT#hashtag+attributes"

Example:

spec = hxl.converters.Tagger.parse_spec("Organisation name#org+name")

Used only by the command-line tools.

Args

s : str
the string representing a tagging specification

Returns

tuple
the parsed specification as a (header text, tag spec) pair, where the tag spec is the display form of the hashtag and attributes

Raises

HXLFilterException
if there is an error parsing the spec
Expand source code
@staticmethod
def parse_spec(s):
    """Parse a tagger spec from a single string.

    The string is in the format "HEADER TEXT#hashtag+attributes"

    Example:
    ```
    spec = hxl.converters.Tagger.parse_spec("Organisation name#org+name")
    ```

    Used only by the command-line tools.

    Args:
        s (str): the string representing a tagging specification

    Returns:
        tuple: a (header text, tag spec) pair, where the tag spec is the ``display_tag`` of the parsed hashtag and attributes

    Raises:
        hxl.filters.HXLFilterException: if the string does not match the spec format

    """
    result = re.match(Tagger._SPEC_PATTERN, s)
    if result:
        # Column.parse() is called with use_exception=True, so it
        # presumably raises on a malformed hashtag -- confirm in hxl.model.
        return (result.group(1), hxl.model.Column.parse(result.group(2), use_exception=True).display_tag)
    # Qualified name: HXLFilterException is not imported into this module.
    raise hxl.filters.HXLFilterException("Bad tagging spec: " + s)

Inherited members