Source code for uw.local.wcms.bin.templates

"""Program to pre-process templates from WCMS.
"""

from pathlib import Path
import re
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

def re_capt (**keys):
    """Construct named capture groups for the specified contents.

    :param dict keys: A dictionary from capture group names to corresponding
        text regular expressions.
    :return: A dictionary from capture group names to corresponding regular
        expressions in the form (?P<[capture]>[contents]) where [capture] is
        replaced by the capture name and [contents] is replaced by the contents
        regular expression.

    The result is suitable as the right-hand side of a '%' formatting
    operation on a pattern containing %(name)s placeholders.
    """
    return {capture: '(?P<%s>%s)' % (capture, contents)
        for capture, contents in keys.items ()}

# Pattern for template source URLs of the form
# https://uwaterloo.ca/<site>/<path>, capturing the "site" and "path" parts.
# The dot in the host name is escaped so that it only matches a literal ".".
url_re = re.compile (r'^https://uwaterloo\.ca/%(site)s/%(path)s$' %
    re_capt (site='[^/]*', path='.*'))

# Fix & adjust template
def fix_url (url):
    """Adjust the given URL to be absolute if necessary.

    :param str url: The URL to adjust.
    :return: The adjusted URL.

    A fragment-only (or empty) URL is left unchanged; anything else is
    resolved relative to the root of the main UW site so the links end up
    going the same place they did in the original page.  URLs which are
    already absolute are not changed by this resolution.
    """
    # The first five components are scheme, netloc, path, params and query;
    # if all of them are empty, at most a fragment is present.
    if any (urlparse (url)[:5]):
        url = urljoin ('https://uwaterloo.ca', url)
    return url

def _graft_fragment (target, fragmentfile, section):
    """Append the contents of one section of an HTML fragment file to a tag.

    :param target: The tag in the template which receives the contents.
    :param fragmentfile: The path of the file containing the HTML fragment.
    :param str section: The name of the element of the parsed fragment whose
        children are to be transferred (e.g. 'head' or 'body').

    The file is parsed as a document of its own; if the parse result has no
    element of the requested name, nothing is appended.
    """
    fragment = BeautifulSoup (fragmentfile.read_text (), 'lxml')
    wrapper = fragment.find (section)
    if wrapper is not None:
        # Move the wrapper element into place, then replace it by its own
        # children so that only the contents remain in the target.
        wrapper = wrapper.extract ()
        target.append (wrapper)
        wrapper.unwrap ()

def fix_template (template, template_dir):
    """Make required adjustments and corrections to the template.

    :param template: The parsed template document; it is modified in place.
    :param template_dir: The directory with information specifying the
        details of the template.  The files 'sitetitle', 'sitehome', 'footer'
        and 'header' in it are optional and are used only if they exist.

    The elements looked up below are used without checking that they were
    found, so a document lacking one of them results in an AttributeError.
    """
    # Clear space for login/logout link
    template.find ('div', id="cas_login").clear ()

    # Fix relative URLs to be absolute.  This happens before anything of our
    # own is inserted, so the URLs added further down are left as written.
    for tag in template.find_all (href=True):
        tag['href'] = fix_url (tag['href'])
    for tag in template.find_all (src=True):
        tag['src'] = fix_url (tag['src'])

    # Replace home page link if needed
    sitelink = (template
        .find ('div', id="site-header")
        .find ('div', class_="uw-section--inner")
        .a)
    sitetitlefile = template_dir / 'sitetitle'
    if sitetitlefile.exists ():
        sitetitle = sitetitlefile.read_text ().strip ()
        sitelink.string = sitetitle
        sitelink['title'] = sitetitle
    else:
        # .string is None when the link has more than one child; fall back
        # to the combined text of the link in that case.
        sitetitle = (sitelink.string or sitelink.get_text ()).strip ()
    sitehomefile = template_dir / 'sitehome'
    if sitehomefile.exists ():
        sitelink['href'] = sitehomefile.read_text ().strip ()

    # Blank out breadcrumbs
    breadcrumblist = (template
        .find ('div', id="main")
        .find ('div', class_="uw-site--breadcrumb")
        .find ('nav', class_="breadcrumb")
        .find ('ol'))
    breadcrumblist.clear ()

    # Insert reference to our main.css stylesheet
    # **TODO: Make this adjustment when and only when deploying to non-Odyssey
    # servers, rather than looking at which site is involved.
    if template_dir.name.startswith ('odyssey'):
        server = ''
    else:
        server = '//odyssey.uwaterloo.ca'
    template.head.append (template.new_tag ('link',
        href=server + "/static/wcms/main.css",
        media="all",
        rel="stylesheet",
        type="text/css"
    ))

    # Clear out menu, including "Information For": empty the menu itself,
    # then empty its parent and put the (now empty) menu back as only child.
    mainmenu = template.find ('div', id="main-menu")
    mainmenu.clear ()
    menuparent = mainmenu.parent
    menuparent.clear ()
    menuparent.append (mainmenu)

    # Blank out content areas.  The title keeps an empty placeholder span
    # followed by the fixed site/university suffix.
    template.title.clear ()
    titlespan = template.new_tag ('span', id="wcms-main-title")
    template.title.append (titlespan)
    template.title.append (' | %s | University of Waterloo' % sitetitle)
    template.find ('div', class_="uw-site--title").h1.clear ()
    template.find ('div', id="site-sidebar-wrapper").find ('div', class_="content").clear ()

    # This removes the content, but also removes the sharing links for
    # reasons that are not entirely clear.
    template.find ('div', id="content").clear ()

    # The sharing links usually get installed into the following, but they
    # aren't that useful anyway in our context so just remove it all:
    template.find ('div', id="uw-site-share").decompose ()

    # Remove unneeded metadata
    body = template.body
    if body.has_attr ('class'):
        body['class'] = [c for c in body['class']
            if not c.startswith ("page-node-")]
    # Collect everything first so that nothing is removed while searching.
    for item in ([]
            + template.head.find_all ('link', rel=["canonical", "shortlink"])
            + template.head.find_all ('meta', property=["og:url", "og:title", "og:description", "og:updated_time", "article:published_time", "article:modified_time"])
            + template.head.find_all ('meta', attrs={'name': ["description", "twitter:url", "twitter:title", "twitter:description"]})
        ):
        item.decompose ()

    # Insert footer associated with this template
    footerfile = template_dir / 'footer'
    if footerfile.exists ():
        footerdiv = (template
            .find ('div', id="site-footer")
            .find ('div', class_="uw-section--inner"))
        footerdiv.clear ()
        _graft_fragment (footerdiv, footerfile, 'body')

    # Insert extra <head> contents associated with this template
    headerfile = template_dir / 'header'
    if headerfile.exists ():
        _graft_fragment (template.head, headerfile, 'head')

def process_template (template_dir, download_dir, output_dir):
    """Process a single template.

    :param template_dir: The directory with information specifying the
        details of the template to process.  Must contain a file specifying
        the source URL of the template.
    :param download_dir: The base directory for the template download.  Must
        contain the template file downloaded by the download process.
    :param output_dir: The directory for the processed template.
    :raises ValueError: If the source URL does not have the expected form.

    Reads the source URL and uses it to find the input template.  Processes the
    template document and writes the output to a corresponding filename in the
    output directory.
    """
    urlfile = template_dir / 'source'
    # Ignore surrounding whitespace such as the newline ending the file
    url = urlfile.read_text ().strip ()
    m = url_re.match (url)
    if m is None:
        raise ValueError ('Invalid URL in %s: %s' % (urlfile, url))
    site, path = m.group ('site', 'path')

    # Compute path of downloaded template file; a URL naming a directory
    # corresponds to the index.html file within it.
    rawfile = download_dir / 'uwaterloo.ca' / site / path
    if not path or path.endswith ('/'):
        rawfile /= 'index.html'

    # Parse template
    template = BeautifulSoup (rawfile.read_text (), 'lxml')

    # Adjust template contents
    fix_template (template, template_dir)

    # Save adjusted template under the name of the template's directory
    (output_dir / template_dir.name).write_text (str (template))

def process_templates (datadir, download_dir, output_dir):
    """Process all templates.

    :param datadir: The base directory containing a directory for each
        template with information specifying the details of the template.
    :param download_dir: The base directory for the template download.
    :param output_dir: The directory for the processed templates.

    The templates are processed in sorted order and the directory of each
    one is printed before it is processed.  Entries of the base directory
    which are not directories are skipped.
    """
    for template_dir in sorted (datadir.iterdir ()):
        if not template_dir.is_dir ():
            # A stray file cannot describe a template
            continue
        print (template_dir)
        process_template (template_dir, download_dir, output_dir)

# Default locations used by main (), all below the user's home directory.
# Base directory with one directory of settings per template:
datadir = Path.home ().joinpath ('git', 'odyssey', 'project', 'wcms', 'data')
# Working area holding the downloaded pages and the processed templates:
wcmsbasedir = Path.home ().joinpath ('wcms')
downloaddir = wcmsbasedir.joinpath ('download')
templatedir = wcmsbasedir.joinpath ('template')

def main ():
    """Process all templates using the default directory locations.

    Takes the template details from datadir, the downloaded pages from
    downloaddir and writes the processed templates to templatedir.
    """
    process_templates (datadir, downloaddir, templatedir)