# Source code for uw.stringtools
"""String-handling utilities.
Contains a procedure to facilitate multiple string substitutions, and a
slight enhancement of the Python-standard :meth:`str.strip`.
"""
from collections import namedtuple
import csv
import re
def multiReplace (changes):
    r"""Create a procedure to perform multiple string substitutions.

    :param dict changes: dictionary of substitutions; keys will be replaced by
        values.
    :return: a procedure which expects a string and returns a string.

    The result string consists of the input string with the specified
    substitutions performed.

    The changes are a dictionary of find/replace values.  The "find" values
    (dictionary keys) are simultaneously found and each occurrence is
    replaced by the corresponding "replace" value (dictionary value).
    Replacement text is never rescanned, so swapping two strings works.

    Works by constructing a regular expression corresponding to the
    alternation of the keys of the dictionary.  When several "find" strings
    could match at the same position, the longest one wins; without this
    ordering a key could be shadowed by one of its own prefixes and never
    be applied.

    The dictionary is copied when the procedure is created, so later
    changes to it do not affect the returned procedure.

    A special case allows for correct handling of the "empty dictionary"
    case of no replacements.
    """
    # Snapshot the mapping: the pattern is compiled once from the keys, so a
    # later mutation by the caller would otherwise leave pattern and lookup
    # out of step (and could raise KeyError inside handleMatch).
    changes = dict (changes)

    if changes:
        # The re module tries alternatives left to right and takes the first
        # that matches, so put longer keys first to stop a key being hidden
        # behind one of its prefixes.
        keys = sorted (changes, key=len, reverse=True)
        pattern = re.compile ('|'.join (re.escape (key) for key in keys))
    else:
        # The empty alternation is the "match no strings" RE, which is not
        # the same as the RE represented by the empty string.  It is
        # impossible to match a character followed by the beginning of the
        # string, so this works.
        pattern = re.compile (r".\A")

    def handleMatch (match):
        # Every match is, by construction, exactly one of the keys.
        return changes[match.group (0)]

    return lambda string: pattern.sub (handleMatch, string)
def strip_string (s):
    """Trim surrounding whitespace from a string, mapping "nothing" to None.

    :param str s: the string to strip, or None.
    :return: the stripped string, or None if s is None or if nothing but
        whitespace was present.

    This is intended for processing text form fields, where an empty or
    all-blank field should be treated the same as an absent one.
    """
    if s is None:
        return None
    trimmed = s.strip ()
    if not trimmed:
        # Only whitespace (or nothing at all) was supplied.
        return None
    return trimmed
def normalize_table_whitespace (grid):
    """Normalize whitespace for a 2 dimensional array of strings.

    :param grid: An iterable of lists of strings.
    :return: An iterator of lists of strings.

    Apply :func:`strip_string` to each element of the provided 2D array of
    strings; also use None values to extend each element of the outer array to
    the length of the longest element.

    An empty grid produces an empty result.
    """
    # The grid is traversed twice (once to find the width, once to build the
    # rows), so materialize it in case a one-shot iterator was passed.
    grid = tuple (grid)
    # default=0 keeps an empty grid from raising ValueError in max ().
    columns = max ((len (line) for line in grid), default=0)
    return ([strip_string (field) for field in line] +
            [None] * (columns - len (line)) for line in grid)
def split_upload_text (text):
    """Split field content text into a rectangular array of field values.

    :param str text: The field content text.
    :return: An iterator of lists of field values.

    The field content is split into lines.  Then each line is divided either
    on tabs (if every line has a tab), or according to CSV rules (otherwise).
    Finally, the result is normalized using :func:`normalize_table_whitespace`.

    Text containing no lines at all yields an empty iterator.
    """
    lines = text.splitlines ()
    if not lines:
        # Nothing to split: return an empty iterator directly rather than
        # handing an empty grid on to the normalizer.
        return iter (())
    if all ('\t' in line for line in lines):
        # Copy/pasted tab-delimited from Excel
        grid = (line.split ('\t') for line in lines)
    else:
        # CSV file upload
        grid = csv.reader (lines)
    return normalize_table_whitespace (grid)
def parse_tabular_file (columns, request, header_skip=0, line_field=None):
    """Parse a file of tabular data according to the specified columns.

    :param list columns: A list of column specifications.  Each one must be a
        3-tuple consisting of the heading in the file which will identify the
        column, the fieldname in the result of the parse, and a parser to
        process values from the column.
    :param request: A 2D array of values representing the input (any iterable
        of rows, including a one-shot iterator).  Each value is either a
        string or None; None values will not be parsed but just passed
        through as None to the output.
    :param int header_skip: The number of initial header lines to skip.
    :param str line_field: The field name in the output to set to the line
        number, or None for no such field name.
    :return: A 2-tuple of result rows and errors.  Each error is a 3-tuple of
        line number, heading (or None) and detail.  If the headings line is
        missing or lacks a required heading, the result rows are None.

    The first non-skipped line is interpreted as column headings.  These
    are matched with the first elements of the provided columns to
    identify which columns of the file will be parsed.  Additional columns
    not given in columns are permitted, but all columns given in columns must
    be present in the headings line.

    The remaining rows are parsed.  Each element that corresponds to one
    of the provided columns is parsed using the corresponding parser.  A
    parser signals a bad value by raising ValueError; the error detail is
    then the exception's ``args`` tuple and the row is left out of the
    result.  A row shorter than the headings line is treated as if padded
    with None.  Rows whose parsed values are all None are skipped.  Each
    remaining row becomes a namedtuple.
    """
    # next () needs an iterator, but callers may pass a plain list of rows.
    rows = iter (request)

    # Consume the skipped lines and then the headings line itself; on exit
    # c is the 1-based line number of the headings line.
    for c in range (1, 1 + header_skip + 1):
        try:
            header_row = next (rows)
        except StopIteration:
            return None, [(c, None, 'Header row missing')]

    # Heading text -> position in each row.
    fieldmap = {col: j for j, col in enumerate (header_row)}

    headings, fields, parsers = zip (*columns)
    if line_field is not None:
        fields = (line_field,) + fields
    nt = namedtuple ('__'.join (fields), fields)

    errors = [(c, h, 'Required field heading “%s” missing' % h)
              for h in headings if h not in fieldmap]
    if errors:
        return None, errors
    indices = [fieldmap[h] for h in headings]

    result = []
    for c, row in enumerate (rows, start=c + 1):
        rowvalues = []
        rowerrors = []
        for p, i, h in zip (parsers, indices, headings):
            # A row shorter than the headings line simply lacks this cell.
            cell = row[i] if i < len (row) else None
            if cell is None:
                rowvalues.append (None)
            else:
                try:
                    rowvalues.append (p (cell))
                except ValueError as x:
                    rowerrors.append ((c, h, x.args))
                    # Keep rowvalues aligned with the columns.
                    rowvalues.append (None)
        if rowerrors:
            errors.extend (rowerrors)
        elif any (v is not None for v in rowvalues):
            # The blank-row test above runs before the line number is added,
            # so a row of nothing but None values is dropped.
            if line_field is not None:
                rowvalues.insert (0, c)
            result.append (nt (*rowvalues))
    return result, errors