Source code for pvpltools.dataplusmeta
"""
This module contains classes/functions to manage small collections
of tabular data and corresponding metadata.
The main class, called DataPlusMeta, bundles the following information:
3. tabular data as a pandas DataFrame
2. column definitions for the data, also as a pandas DataFrame
1. additional metadata in a (possibly nested) dictionary
0. an optional string identifying the data source
Methods are provided for reading and writing text files. These will
consist of three mandatory sections:
1. metadata in yaml format
2. column definitions as a csv table
3. data columns as a csv table
The three sections are separated by two consecutive blank lines.
Note on dates and times:
Data columns which contain pandas Timestamps will be written
to the text files in the pandas default format. Data type (dtype)
information is stored in the column definitions table, and is used
to identify date/time columns that need parsing when read back in.
DataPlusMeta will probably be extended to read/write in other formats,
such as hdf5 or native Excel.
Copyright (c) 2019-2020 Anton Driesse, PV Performance Labs.
"""
from warnings import warn
from io import StringIO
import pandas as pd
from ruamel.yaml import YAML
# Module-wide YAML handler used for the metadata section of the text files
# (see DataPlusMeta.from_txt / DataPlusMeta.to_txt).  typ='rt' selects
# ruamel.yaml's round-trip mode.
yaml = YAML(typ='rt')
# NOTE(review): None presumably lets the dumper choose flow style for leaf
# collections while keeping nested structures in block style -- confirm
# against the ruamel.yaml documentation.
yaml.default_flow_style = None
# Allow non-ASCII characters to be written as-is rather than escaped.
yaml.allow_unicode = True
# Constants that control how the text files are formatted and parsed.
# Changing any of them breaks compatibility with existing files, so they
# are best left alone.

# Explanatory header that was originally meant to open every file.
PREAMBLE = """\
# This file contains three sections separated by two blank lines.
# The first section contains meta data, which can be parsed with yaml.
# The second and third sections contain data column definitions
# and data respectively, both formatted as csv tables.
"""
# On second thought, the preamble is not used: override it with nothing.
PREAMBLE = ''

# --- encoding ---
ENCODING = 'utf-8'
UTF8_BOM = '\ufeff'

# --- line-level building blocks ---
COMMENT_CHAR = '#'
BLANK_LINE = '\n'
COMMENT_LINE = COMMENT_CHAR + BLANK_LINE
SECTION_SEPARATOR = BLANK_LINE * 2

# --- date/time handling ---
# TODO find a way to optionally accept fractional seconds
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# --- keyword arguments forwarded to pandas ---
READ_CSV_OPTIONS = {
    'skipinitialspace': True,
    'comment': COMMENT_CHAR,
    'float_precision': 'round_trip',
}
TO_CSV_OPTIONS = dict(lineterminator='\n')
[docs]
class DataPlusMeta():
    """
    Class to bundle tabular data with metadata, with methods to read
    and write files.

    Author: Anton Driesse, PV Performance Labs

    Parameters
    ----------
    data : pandas.DataFrame, default=None
        Two-dimensional tabular data.

    cdef : pandas.DataFrame, default=None
        Column definitions for the tabular data.  Should logically have one
        row for each column in data, including the index, because the
        consistency checks compare against ``data.reset_index()``.

    meta : dict, default=None
        Dictionary containing any useful (or not) metadata.  May contain
        lists and other nested dictionaries.  A falsy value is replaced
        by an empty dict.

    source : str, default=None
        Identifies the source of the data, if needed.  Typically a file
        name.  A falsy value is replaced by an empty string.

    Notes
    -----
    On construction, cdef is checked against data; a mismatch only
    produces a warning.
    """

    def __init__(self, data=None, cdef=None, meta=None, source=None):
        self.data = data
        self.cdef = cdef
        self.meta = meta or {}
        self.source = source or ''

        # Check and warn (never raise) if cdef does not match data
        self.check_cdef(raise_on_mismatch=False)

    def __repr__(self):
        """
        Produce a simple string describing self.
        """
        # shape[1] is the number of data columns; the index is not a
        # data column and is therefore not counted.
        ndata = 0 if self.data is None else self.data.shape[1]

        classname = self.__class__.__name__
        source = self.source or 'unknown source'

        return ('%s object with %d data columns from %s.' %
                (classname, ndata, source))

    def check_cdef(self, raise_on_mismatch=False):
        """
        Check whether cdef is consistent with data.  This means matching
        labels and matching dtype values.

        Parameters
        ----------
        raise_on_mismatch : bool, default=False
            If True, a mismatch raises a RuntimeError; otherwise it
            generates a warning.

        Returns
        -------
        bool
            True if cdef and data are consistent (or both absent),
            False if a mismatch was found and only warned about.

        Raises
        ------
        RuntimeError
            If a mismatch is found and raise_on_mismatch is True.
        """
        message = None

        if self.cdef is None and self.data is None:
            # nothing to compare, hence nothing to complain about
            pass
        elif self.cdef is None or self.data is None:
            message = 'Either cdef or data is missing.'
        else:
            # the index is treated as just another column
            dtypes = self.data.reset_index().dtypes.astype(str)

            # Index.equals() also copes with a differing number of labels,
            # where an element-wise == comparison would raise ValueError.
            if not self.cdef.index.equals(dtypes.index):
                message = 'Labels in cdef do not match labels in data.'
            elif 'dtype' not in self.cdef.columns:
                message = 'No dtypes in cdef.'
            elif not (self.cdef['dtype'] == dtypes).all():
                message = 'dtypes in cdef do not match dtypes in data'

        if message is None:
            return True

        if raise_on_mismatch:
            raise RuntimeError(message)

        warn(message)
        return False

    def update_cdef(self, raise_on_fail=True):
        """
        Update the dtype column in cdef with dtype values found in data,
        or create a new cdef table and/or dtype column.

        If there is no data, cdef is reset to None.

        Parameters
        ----------
        raise_on_fail : bool, default=True
            If True, raise a RuntimeError when the cdef labels do not match
            the data labels; otherwise generate a warning.

        Returns
        -------
        bool
            True if cdef is now consistent with data, False if the labels
            did not match and only a warning was issued.

        Raises
        ------
        RuntimeError
            If the labels do not match and raise_on_fail is True.
        """
        if self.data is None:
            self.cdef = None
            return True

        # the index is treated as just another column
        dtypes = self.data.reset_index().dtypes.astype(str)

        if self.cdef is None:
            self.cdef = pd.DataFrame(dtypes, columns=['dtype'])
            self.cdef.index.name = 'column'
        elif self.cdef.index.equals(dtypes.index):
            # Index.equals() also copes with a differing number of labels,
            # where an element-wise == comparison would raise ValueError.
            self.cdef['dtype'] = dtypes
        else:
            message = 'Labels in cdef do not match labels in data.'
            if raise_on_fail:
                raise RuntimeError(message)
            warn(message)
            return False

        return True

    @classmethod
    def from_txt(cls, file, use_dtypes=True):
        """
        Read the contents of a text file to create a DataPlusMeta object.

        Parameters
        ----------
        file : str
            Name or path of a text file

        use_dtypes : bool, default=True
            If True and the column definitions contain a 'dtype' column,
            those dtypes are applied when parsing the data section, and
            columns whose dtype starts with 'datetime' are converted to
            timestamps.  Otherwise pandas infers the data types.

        Returns
        -------
        DataPlusMeta
            New object whose source is set to file.

        Raises
        ------
        RuntimeError
            If the file does not consist of exactly three sections.
        """
        # read the entire file so that it can be split easily
        with open(file, encoding=ENCODING) as f:
            buffer = f.read()

        # a section ends with the newline of its last line, which is then
        # followed by the separator
        sections = buffer.split('\n' + SECTION_SEPARATOR)

        # having 3 sections is a basic requirement
        if len(sections) != 3:
            raise RuntimeError('%s does not have three sections.' % file)

        # parse meta with the module-level yaml handler
        meta = yaml.load(sections[0])

        # parse column definitions; dtype names must remain strings
        cdef = pd.read_csv(StringIO(sections[1]), index_col=0,
                           dtype=dict(dtype=str),
                           **READ_CSV_OPTIONS)

        if use_dtypes and ('dtype' in cdef.columns):
            dtypes = cdef['dtype'].dropna()

            # identify timestamps
            timestamps = dtypes.str.startswith('datetime')

            # pass others to read_csv
            dtype_dict = dtypes[~timestamps].to_dict()
            data = pd.read_csv(StringIO(sections[2]), index_col=None,
                               dtype=dtype_dict, **READ_CSV_OPTIONS)

            # post-parse date columns with standard format
            for col in dtypes[timestamps].index:
                data[col] = pd.to_datetime(data[col], format=DATE_FORMAT)
        else:
            # tolerate missing dtype in cdef for files created by other means
            data = pd.read_csv(StringIO(sections[2]), index_col=None,
                               **READ_CSV_OPTIONS)

        # set index column for normal use
        data = data.set_index(data.columns[0])

        return cls(data, cdef, meta, source=file)

    def to_txt(self, file, update_cdef=True, preamble=None):
        """
        Write the contents of a DataPlusMeta object to a text file.

        Parameters
        ----------
        file : str
            Name or path of a text file
            Specified file is overwritten without warning if it exists.

        update_cdef : bool, default=True
            If True, the dtype column of cdef is refreshed from data
            before writing; if False, cdef is only checked against data.
            Either way a mismatch raises a RuntimeError.

        preamble : str, default=None
            Optional text written at the top of the file.  Each line is
            prefixed with the comment character and a space, and the
            preamble is followed by one blank line.

        Raises
        ------
        RuntimeError
            If there is no data, or if cdef is inconsistent with data.
            In both cases the target file is left untouched.
        """
        if self.data is None:
            raise RuntimeError('There is no data to store.')

        # Validate before the file is opened for writing, so that a failure
        # here cannot destroy the contents of an existing file.
        if update_cdef:
            self.update_cdef(raise_on_fail=True)
        else:
            self.check_cdef(raise_on_mismatch=True)

        # fill in missing names, if needed
        if self.cdef.index.name is None:
            self.cdef.index.name = 'column'

        if self.data.index.name is None:
            self.data.index.name = self.cdef.index[0]

        with open(file, 'w', encoding=ENCODING) as f:

            f.write(UTF8_BOM)

            if preamble:
                for line in preamble.splitlines():
                    f.write(COMMENT_CHAR + ' ' + line + '\n')
                f.write(BLANK_LINE)

            yaml.dump(self.meta, stream=f)

            f.write(SECTION_SEPARATOR)

            # Note: the csv line terminator is set to '\n' (see
            # TO_CSV_OPTIONS) to avoid a double conversion to '\r\n' on
            # Windows when the text is written to the file.

            # Each table is written as: header row, blank line, body rows.
            self.cdef[:0].to_csv(f, index=True, **TO_CSV_OPTIONS)
            f.write(BLANK_LINE)
            self.cdef.to_csv(f, header=False, index=True, **TO_CSV_OPTIONS)

            f.write(SECTION_SEPARATOR)

            self.data.iloc[:0].to_csv(f, index=True, **TO_CSV_OPTIONS)
            f.write(BLANK_LINE)
            self.data.to_csv(f, header=False, index=True, **TO_CSV_OPTIONS)