Source code for pvpltools.dataplusmeta

"""
This module contains classes/functions to manage small collections
of tabular data and corresponding metadata.

The main class, called DataPlusMeta, bundles the following information:

    1. tabular data as a pandas DataFrame
    2. column definitions for the data, also as a pandas DataFrame
    3. additional metadata in a (possibly nested) dictionary
    4. an optional string identifying the data source

Methods are provided for reading and writing text files.  These will
consist of three mandatory sections:

    1. metadata in yaml format
    2. column definitions as a csv table
    3. data columns as a csv table

The three sections are separated by two consecutive blank lines.

Note on dates and times:
    Data columns which contain pandas Timestamps will be written
    to the text files in the pandas default format.  Data type (dtype)
    information is stored in the columns definitions table, and is used
    to identify date/time columns that need parsing when read back in.

DataPlusMeta will probably be extended to read/write in other formats,
such as hdf5 or native Excel.

Copyright (c) 2019-2020 Anton Driesse, PV Performance Labs.
"""

from warnings import warn
from io import StringIO
import pandas as pd

from ruamel.yaml import YAML
# Module-level YAML instance shared by from_txt (load) and to_txt (dump)
# for the metadata section.  typ='rt' selects ruamel's round-trip mode.
yaml = YAML(typ='rt')
# NOTE(review): None presumably lets the dumper use flow style only for
# leaf collections -- confirm against the ruamel.yaml documentation.
yaml.default_flow_style = None
# NOTE(review): presumably emits non-ASCII characters unescaped -- confirm.
yaml.allow_unicode = True

# Constants controlling how the text files are formatted and parsed.
# Changing any of them may break compatibility with existing files.

PREAMBLE = (
    '# This file contains three sections separated by two blank lines.\n'
    '# The first section contains meta data, which can be parsed with yaml.\n'
    '# The second and third sections contain data column definitions\n'
    '# and data respectively, both formatted as csv tables.\n'
)
# On second thought, let's not use the preamble: the empty string below
# overrides the text above.
PREAMBLE = ''

UTF8_BOM = '\ufeff'
ENCODING = 'utf-8'

BLANK_LINE = '\n'
SECTION_SEPARATOR = 2 * BLANK_LINE

COMMENT_CHAR = '#'
COMMENT_LINE = '%s\n' % COMMENT_CHAR

# TODO find a way to optionally accept fractional seconds
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Keyword arguments passed to every pandas.read_csv call in this module.
READ_CSV_OPTIONS = {
    'skipinitialspace': True,
    'comment': COMMENT_CHAR,
    'float_precision': 'round_trip',
}

# Keyword arguments passed to every DataFrame.to_csv call in this module.
TO_CSV_OPTIONS = dict(lineterminator='\n')



[docs]
class DataPlusMeta:
    """
    Class to bundle tabular data with metadata, with methods to read
    and write files.

    Author: Anton Driesse, PV Performance Labs

    data : pandas.DataFrame, default=None
        Two-dimensional tabular data.

    cdef : pandas.DataFrame, default=None
        Column definitions for the tabular data. Should logically have one
        row for each column in data, where the index of data counts as a
        column too. (This is enforced in file operations.)

    meta : dict, default=None
        Dictionary containing any useful (or not) metadata. May contain
        lists and other nested dictionaries.  A falsy value is replaced
        by a new empty dict.

    source : str, default=None
        Identifies the source of the data, if needed.  Typically a file
        name.  A falsy value is replaced by an empty string.
    """

    def __init__(self, data=None, cdef=None, meta=None, source=None):
        self.data = data
        self.cdef = cdef
        self.meta = meta or {}
        self.source = source or ''

        # Check and warn (but never raise) if cdef does not match data
        self.check_cdef(raise_on_mismatch=False)

    def __repr__(self):
        '''
        Produce a simple string describing self.
        '''
        if self.data is None:
            ndata = 0
        else:
            # the index is counted as well, just as it has its own row
            # in cdef (see the reset_index() call in _data_dtypes)
            ndata = self.data.shape[1] + 1

        classname = self.__class__.__name__
        source = self.source or 'unknown source'

        return ('%s object with %d data columns from %s.' %
                (classname, ndata, source))

    def _data_dtypes(self):
        '''
        Return the dtype names of the index and all columns of data as a
        Series of strings, labelled by column name.  Requires data.
        '''
        return self.data.reset_index().dtypes.astype(str)

    def _labels_match(self, dtypes):
        '''
        Return True if the row labels of cdef equal the labels of dtypes,
        element by element and in the same order.  Requires cdef.

        The lengths are compared first because an element-wise comparison
        of two Index objects of different length raises a ValueError.
        '''
        labels = self.cdef.index
        return (len(labels) == len(dtypes.index)
                and all(labels == dtypes.index))

    def check_cdef(self, raise_on_mismatch=False):
        '''
        Check whether cdef is consistent with data.  This means matching
        labels and matching dtype values.

        raise_on_mismatch : bool, default=False
            If True, a mismatch raises a RuntimeError; otherwise it
            generates a warning.

        Returns True if cdef and data are consistent (or both are None),
        and False if a mismatch was found and reported as a warning.
        '''
        message = None

        if self.cdef is None and self.data is None:
            # nothing to compare
            pass

        elif self.cdef is None or self.data is None:
            message = 'Either cdef or data is missing.'

        else:
            dtypes = self._data_dtypes()

            if not self._labels_match(dtypes):
                message = 'Labels in cdef do not match labels in data.'

            elif 'dtype' not in self.cdef.columns:
                message = 'No dtypes in cdef.'

            elif not all(self.cdef['dtype'] == dtypes):
                message = 'dtypes in cdef do not match dtypes in data'

        if message is None:
            return True

        if raise_on_mismatch:
            raise RuntimeError(message)

        warn(message)
        return False

    def update_cdef(self, raise_on_fail=True):
        '''
        Update the dtype column in cdef with dtype values found in data,
        or create a new cdef table and/or dtype column.  If there is no
        data, cdef is set to None.

        raise_on_fail : bool, default=True
            If True, raise a RuntimeError when the labels of an existing
            cdef do not match the data labels; otherwise generate a
            warning.

        Returns True on success, and False if the labels did not match
        and a warning was generated.
        '''
        if self.data is None:
            self.cdef = None
            return True

        dtypes = self._data_dtypes()

        if self.cdef is None:
            self.cdef = pd.DataFrame(dtypes, columns=['dtype'])
            self.cdef.index.name = 'column'

        elif self._labels_match(dtypes):
            self.cdef['dtype'] = dtypes

        else:
            message = 'Labels in cdef do not match labels in data.'
            if raise_on_fail:
                raise RuntimeError(message)
            warn(message)
            return False

        return True

    @classmethod
    def from_txt(cls, file, use_dtypes=True):
        """
        Read the contents of a text file to create a DataPlusMeta object.

        file : str
            Name or path of a text file

        use_dtypes : bool, default=True
            If True and cdef has a dtype column, use those dtypes to
            parse the data section; columns whose dtype starts with
            'datetime' are parsed as timestamps using DATE_FORMAT.
            Otherwise pandas infers the data types.

        Raises a RuntimeError if the file does not consist of exactly
        three sections.
        """

        # read the entire file so that it can be split easily
        with open(file, encoding=ENCODING) as f:
            buffer = f.read()

        # drop the byte order mark that to_txt writes at the start of the
        # file so that it cannot end up in the first section
        if buffer.startswith(UTF8_BOM):
            buffer = buffer[len(UTF8_BOM):]

        # the newline ending the last line of a section plus the two
        # blank lines of the separator
        sections = buffer.split('\n' + SECTION_SEPARATOR)

        # having 3 sections is a basic requirement
        if len(sections) != 3:
            raise RuntimeError('%s does not have three sections.' % file)

        meta_text, cdef_text, data_text = sections

        # parse meta
        # NOTE(review): this loads external file content with the
        # module-level ruamel round-trip instance; presumably that does
        # not construct arbitrary Python objects -- confirm before
        # reading untrusted files.
        meta = yaml.load(meta_text)

        # parse column definitions, keeping dtype names as strings
        cdef = pd.read_csv(StringIO(cdef_text), index_col=0,
                           dtype=dict(dtype=str),
                           **READ_CSV_OPTIONS)

        if use_dtypes and ('dtype' in cdef.columns):
            dtypes = cdef['dtype'].dropna()

            # identify timestamps
            is_timestamp = dtypes.str.startswith('datetime')

            # pass others to read_csv
            dtype_dict = dtypes[~is_timestamp].to_dict()

            data = pd.read_csv(StringIO(data_text), index_col=None,
                               dtype=dtype_dict, **READ_CSV_OPTIONS)

            # post-parse date columns with standard format
            for col in dtypes[is_timestamp].index:
                data[col] = pd.to_datetime(data[col], format=DATE_FORMAT)

        else:
            # tolerate missing dtype in cdef for files created by other means
            data = pd.read_csv(StringIO(data_text), index_col=None,
                               **READ_CSV_OPTIONS)

        # set index column for normal use
        data = data.set_index(data.columns[0])

        return cls(data, cdef, meta, source=file)

    def to_txt(self, file, update_cdef=True, preamble=None):
        """
        Write the contents of a DataPlusMeta object to a text file.

        file : str
            Name or path of a text file

        update_cdef : bool, default=True
            If True, bring cdef up to date with data before writing
            (see update_cdef); otherwise only verify that cdef matches
            data (see check_cdef).

        preamble : str, default=None
            Optional text written at the top of the file, each line
            prefixed with the comment character.

        Specified file is overwritten without warning if it exists, but
        only after validation has succeeded: a RuntimeError raised because
        there is no data, or because cdef does not match data, leaves an
        existing file untouched.

        Note that missing index names on cdef and data are filled in on
        the object itself.
        """
        if self.data is None:
            raise RuntimeError('There is no data to store.')

        # validate before the file is opened for writing
        if update_cdef:
            self.update_cdef(raise_on_fail=True)
        else:
            self.check_cdef(raise_on_mismatch=True)

        # fill in missing names, if needed
        if self.cdef.index.name is None:
            self.cdef.index.name = 'column'

        if self.data.index.name is None:
            self.data.index.name = self.cdef.index[0]

        with open(file, 'w', encoding=ENCODING) as f:
            f.write(UTF8_BOM)

            if preamble:
                for line in preamble.splitlines():
                    f.write(COMMENT_CHAR + ' ' + line + '\n')
                f.write(BLANK_LINE)

            yaml.dump(self.meta, stream=f)

            f.write(SECTION_SEPARATOR)

            # Note: lineterminator is set to '\n' to avoid double conversion
            # to \r\n on Windows when the buffer is written to the file

            # each table is written as its header line, a blank line,
            # and then the rows without header
            self.cdef.iloc[:0].to_csv(f, index=True, **TO_CSV_OPTIONS)
            f.write(BLANK_LINE)
            self.cdef.to_csv(f, header=False, index=True, **TO_CSV_OPTIONS)

            f.write(SECTION_SEPARATOR)

            self.data.iloc[:0].to_csv(f, index=True, **TO_CSV_OPTIONS)
            f.write(BLANK_LINE)
            self.data.to_csv(f, header=False, index=True, **TO_CSV_OPTIONS)