Source code for uw.stringtools

"""String-handling utilities.

Contains a procedure to facilitate multiple string substitutions, and a
slight enhancement of the Python-standard :meth:`str.strip`.
"""

from collections import namedtuple
import csv
import re

def multiReplace(changes):
    r"""Build a function that applies several string substitutions at once.

    :param dict changes: mapping of "find" strings (keys) to their
        "replace" strings (values).

    The returned function takes a string and returns a new string in which
    every occurrence of a key of ``changes`` has been replaced by the
    corresponding value.  All keys are searched for simultaneously in a
    single pass, so the output of one replacement is never re-scanned for
    further replacements.

    The search is implemented as one regular expression: the alternation of
    the (escaped) keys.  When "find" strings overlap, which one wins is
    whatever the :mod:`re` library does for that alternation.

    An empty ``changes`` dictionary is handled specially and yields a
    function that returns its input unchanged.
    """
    if len(changes) == 0:
        # An alternation with no branches must match nothing at all, which
        # is different from the empty pattern (that one matches everywhere).
        # "Any character followed by start-of-string" can never match.
        pattern = re.compile(r".\A")
    else:
        alternatives = map(re.escape, changes)
        pattern = re.compile('|'.join(alternatives))

    def replacement_for(found):
        # The whole match is always exactly one of the keys.
        return changes[found.group(0)]

    def substitute(string):
        return pattern.sub(replacement_for, string)

    return substitute
def strip_string(s):
    """Strip surrounding whitespace, mapping blank results to None.

    :param str s: the string to strip; may be None.

    Meant for cleaning up text form fields.  Leading and trailing
    whitespace is removed.  If nothing is left afterwards, or if ``s`` was
    None to begin with, None is returned instead of a string.
    """
    if s is None:
        return None
    trimmed = s.strip()
    if trimmed:
        return trimmed
    return None
def normalize_table_whitespace(grid):
    """Normalize whitespace for a 2 dimensional array of strings.

    :param grid: An iterable of lists of strings.
    :return: An iterator of lists of strings.

    Apply :func:`strip_string` to each element of the provided 2D array of
    strings; also use None values to extend each element of the outer array
    to the length of the longest element.

    An empty ``grid`` produces an empty iterator.
    """
    # Two passes are needed (measure, then pad), and grid may be a one-shot
    # iterator such as a csv.reader, so materialize it first.
    rows = tuple(grid)
    # default=0 keeps max() from raising ValueError when there are no rows.
    width = max((len(row) for row in rows), default=0)
    return (
        [strip_string(field) for field in row] + [None] * (width - len(row))
        for row in rows
    )
def split_upload_text(text):
    """Turn the text of an upload field into a rectangular grid of values.

    :param str text: The field content text.
    :return: The normalized rows, as produced by
        :func:`normalize_table_whitespace`.

    The text is first broken into lines.  If every line contains a tab
    character, each line is split on tabs; otherwise the lines are parsed
    according to CSV rules.  The resulting rows are then passed through
    :func:`normalize_table_whitespace`.
    """
    rows = text.splitlines()
    tab_delimited = all('\t' in row for row in rows)
    if not tab_delimited:
        # CSV file upload
        cells = csv.reader(rows)
    else:
        # Copy/pasted tab-delimited from Excel
        cells = (row.split('\t') for row in rows)
    return normalize_table_whitespace(cells)
def parse_tabular_file(columns, request, header_skip=0, line_field=None):
    """Parse a file of tabular data according to the specified columns.

    :param list columns: A list of column specifications.  Each one must be
        a 3-tuple consisting of the heading in the file which will identify
        the column, the fieldname in the result of the parse, and a parser
        to process values from the column.
    :param request: A 2D array of values representing the input, given as
        any iterable of rows (a list, or an iterator such as the result of
        ``normalize_table_whitespace``).  Each value is either a string or
        None; None values will not be parsed but just passed through as
        None to the output.
    :param int header_skip: The number of initial header lines to skip.
    :param str line_field: The field name in the output to set to the line
        number, or None for no such field name.
    :return: A 2-tuple of result rows and errors.  Each error is a 3-tuple
        of line number, heading (or None) and error detail.  If the
        headings line is missing or lacks a required heading, the result
        rows are None.

    The first non-skipped line is interpreted as column headings.  These
    are matched with the first elements of the provided columns to identify
    which columns of the file will be parsed.  Additional columns not given
    in columns are permitted, but all columns given in columns must be
    present in the headings line.

    The remaining rows are parsed.  Each element that corresponds to one of
    the provided columns is parsed using the corresponding parser; a parser
    signals a bad value by raising ValueError, whose ``args`` become the
    error detail.  A row with any bad value contributes only errors.  A row
    whose parsed values are all None is skipped.  Every other row becomes a
    namedtuple.
    """
    # Accept lists and tuples as well as iterators; iter() on an iterator
    # returns the iterator itself, so existing callers are unaffected.
    rows = iter(request)

    # Lines are numbered from 1; the headings are on the line right after
    # the skipped ones.
    header_line = header_skip + 1
    for line_no in range(1, header_line + 1):
        try:
            header_row = next(rows)
        except StopIteration:
            return None, [(line_no, None, 'Header row missing')]

    # Heading text -> position in the row.
    fieldmap = {col: j for j, col in enumerate(header_row)}

    headings, fields, parsers = zip(*columns)
    if line_field is not None:
        fields = (line_field,) + fields
    nt = namedtuple('__'.join(fields), fields)

    errors = [
        (header_line, h, 'Required field heading “%s” missing' % h)
        for h in headings
        if h not in fieldmap
    ]
    if errors:
        return None, errors

    indices = [fieldmap[h] for h in headings]

    result = []
    for line_no, row in enumerate(rows, start=header_line + 1):
        rowvalues = []
        rowerrors = []
        for parse, index, heading in zip(parsers, indices, headings):
            cell = row[index]
            if cell is None:
                rowvalues.append(None)
                continue
            try:
                rowvalues.append(parse(cell))
            except ValueError as exc:
                rowerrors.append((line_no, heading, exc.args))
                rowvalues.append(None)

        if rowerrors:
            errors.extend(rowerrors)
        elif any(value is not None for value in rowvalues):
            # The blank-row test above deliberately runs before the line
            # number is added, since that value is never None.
            if line_field is not None:
                rowvalues.insert(0, line_no)
            result.append(nt(*rowvalues))

    return result, errors