Source code for uw.local.wcms.bin.templates

"""Program to pre-process templates from WCMS.
"""

from pathlib import Path
import re
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

def re_capt (**keys):
    """Wrap each given regular expression in a named capture group.

    :param dict keys: A mapping from capture group names to the regular
        expression text each group should match.
    :return: A dictionary with the same keys, where each value has the form
        ``(?P<name>contents)``: ``name`` is the capture group name and
        ``contents`` is the regular expression text supplied for it.
    """
    groups = {}
    for name, pattern in keys.items ():
        groups[name] = '(?P<%s>%s)' % (name, pattern)
    return groups
# Matches the source URL of a template on the main UW site.  The "site" group
# captures the first path component and the "path" group captures whatever
# follows it.  The dot in the host name is escaped so that it matches only a
# literal "." rather than an arbitrary character.
url_re = re.compile (r'^https://uwaterloo\.ca/%(site)s/%(path)s$' % re_capt (site='[^/]*', path='.*'))

# Fix & adjust template
def fix_url (url):
    """Return an absolute version of the given URL where one is needed.

    :param url: The URL to adjust.
    :return: The adjusted URL.

    A URL that consists of at most a fragment is returned unchanged.  Any
    other URL is resolved against the root of the main UW site, so that links
    keep pointing to the same place they did in the original page.
    """
    components = urlparse (url)
    # Everything before the fragment: scheme, netloc, path, params and query.
    fragment_only = components[:5] == ('', '', '', '', '')
    if fragment_only:
        return url
    return urljoin ('https://uwaterloo.ca', url)
def fix_template (template, template_dir):
    """Adjust a downloaded WCMS page so it can be used as a template.

    The document is modified in place.

    :param template: The parsed template document (a BeautifulSoup object).
    :param template_dir: The directory with the settings for this template.
        The optional files ``sitetitle``, ``sitehome``, ``footer`` and
        ``header`` in it are used when present.
    """
    # Empty the area that will hold the login/logout link.
    login_box = template.find ('div', id="cas_login")
    login_box.clear ()

    # Make every link target and resource reference absolute.
    for attribute in ('href', 'src'):
        for element in template.find_all (attrs={attribute: True}):
            element[attribute] = fix_url (element[attribute])

    # Locate the home page link in the site header.
    site_header = template.find ('div', id="site-header")
    header_inner = site_header.find ('div', class_="uw-section--inner")
    home_link = header_inner.a

    # Use the configured site title if there is one; otherwise keep the
    # title already shown by the link.
    title_override = template_dir / 'sitetitle'
    if not title_override.exists ():
        sitetitle = home_link.string.strip ()
    else:
        sitetitle = title_override.read_text ().strip ()
        home_link.string = sitetitle
        home_link['title'] = sitetitle

    # Point the home page link somewhere else if configured.
    home_override = template_dir / 'sitehome'
    if home_override.exists ():
        home_link['href'] = home_override.read_text ().strip ()

    # Empty the breadcrumb list.
    main_area = template.find ('div', id="main")
    breadcrumb_box = main_area.find ('div', class_="uw-site--breadcrumb")
    breadcrumb_nav = breadcrumb_box.find ('nav', class_="breadcrumb")
    breadcrumb_nav.find ('ol').clear ()

    # Link the WCMS stylesheet, served from the Odyssey server.
    # **TODO: Make this adjustment when and only when deploying to non-Odyssey
    # servers, rather than looking at which site is involved.
    on_odyssey = template_dir.name.startswith ('odyssey')
    server = '' if on_odyssey else '//odyssey.uwaterloo.ca'
    stylesheet = template.new_tag ('link',
        href=server + "/static/wcms/main.css",
        media="all", rel="stylesheet", type="text/css")
    template.head.append (stylesheet)

    # Empty the main menu and remove its siblings (including "Information
    # For"), leaving the emptied menu as the only child of its parent.
    menu = template.find ('div', id="main-menu")
    menu_container = menu.parent
    menu.clear ()
    menu_container.clear ()
    menu_container.append (menu)

    # Replace the page title with a placeholder span followed by the site
    # title and the university name.
    page_title = template.title
    page_title.clear ()
    page_title.append (template.new_tag ('span', id="wcms-main-title"))
    page_title.append (' | %s | University of Waterloo' % sitetitle)

    # Empty the heading, the sidebar content and the main content.
    template.find ('div', class_="uw-site--title").h1.clear ()
    sidebar = template.find ('div', id="site-sidebar-wrapper")
    sidebar.find ('div', class_="content").clear ()
    # Clearing the content also removes the sharing links, for reasons that
    # are not entirely clear.
    template.find ('div', id="content").clear ()
    # The sharing links are normally installed into the following element;
    # they are not useful in our context, so remove the element entirely.
    template.find ('div', id="uw-site-share").decompose ()

    # Remove metadata that is specific to the downloaded page.
    body = template.body
    body['class'] = [name for name in body['class']
                     if not name.startswith ("page-node-")]
    head = template.head
    # Collect everything first, then remove it.
    stale = [
        *head.find_all ('link', rel=["canonical", "shortlink"]),
        *head.find_all ('meta', property=[
            "og:url", "og:title", "og:description", "og:updated_time",
            "article:published_time", "article:modified_time"]),
        *head.find_all ('meta', attrs={'name': [
            "description", "twitter:url", "twitter:title",
            "twitter:description"]}),
    ]
    for element in stale:
        element.decompose ()

    # Replace the footer contents with the footer configured for this
    # template, if any.
    footer_source = template_dir / 'footer'
    if footer_source.exists ():
        site_footer = template.find ('div', id="site-footer")
        footer_slot = site_footer.find ('div', class_="uw-section--inner")
        footer_slot.clear ()
        footer_doc = BeautifulSoup (footer_source.read_text (), 'lxml')
        footer_body = footer_doc.body
        if footer_body is not None:
            # Move the parsed <body> into place, then drop the <body> tag
            # itself so that only its children remain.
            footer_slot.append (footer_body.extract ())
            footer_body.unwrap ()

    # Append the extra <head> contents configured for this template, if any.
    head_source = template_dir / 'header'
    if head_source.exists ():
        extra_doc = BeautifulSoup (head_source.read_text (), 'lxml')
        extra_head = extra_doc.head
        if extra_head is not None:
            # Same approach as for the footer: move, then unwrap.
            template.head.append (extra_head.extract ())
            extra_head.unwrap ()
def process_template (template_dir, download_dir, output_dir):
    """Process a single template.

    :param template_dir: The directory with information specifying the
        details of the template to process.  Must contain a file named
        ``source`` specifying the source URL of the template.
    :param download_dir: The base directory for the template download.  Must
        contain the template file downloaded by the download process.
    :param output_dir: The directory for the processed template.
    :raises ValueError: If the source URL does not have the expected form.

    Reads the source URL and uses it to find the input template.  Processes
    the template document and writes the output to a file in the output
    directory named after the template directory.
    """
    urlfile = template_dir / 'source'
    # Strip surrounding whitespace so that line endings in the file (in
    # particular a carriage return) do not end up as part of the URL.
    url = urlfile.read_text ().strip ()

    m = url_re.match (url)
    if m is None:
        # Format the message here; exceptions do not interpolate extra
        # arguments the way logging calls do.
        raise ValueError ('Invalid URL in %s: %s' % (urlfile, url))

    # Compute path of downloaded template file
    site = m.group ('site')
    path = m.group ('path')
    rawfile = download_dir / 'uwaterloo.ca' / site / path
    if not path or path.endswith ('/'):
        # A URL naming a directory was saved as its index page.
        rawfile /= 'index.html'

    # Parse template
    template = BeautifulSoup (rawfile.read_text (), 'lxml')

    # Adjust template contents
    fix_template (template, template_dir)

    # Save adjusted template
    (output_dir / template_dir.name).write_text (str (template))
def process_templates (datadir, download_dir, output_dir):
    """Process every template found in the data directory.

    :param datadir: The base directory containing one directory per template,
        each with the information specifying the details of that template.
    :param download_dir: The base directory for the template download.
    :param output_dir: The directory for the processed templates.

    The path of each entry is printed before the entry is processed.
    """
    entries = datadir.iterdir ()
    for entry in entries:
        print (entry)
        process_template (entry, download_dir, output_dir)
# Default locations used by main (), all below the user's home directory.

# Base directory holding one settings directory per template.
datadir = Path.home ().joinpath ('git/odyssey/project/wcms/data')

# Working area for WCMS files: downloaded pages and processed templates.
wcmsbasedir = Path.home ().joinpath ('wcms')
downloaddir = wcmsbasedir.joinpath ('download')
templatedir = wcmsbasedir.joinpath ('template')
def main ():
    """Process all templates using the default directory locations."""
    process_templates (
        datadir=datadir,
        download_dir=downloaddir,
        output_dir=templatedir,
    )