"""Program to pre-process templates from WCMS.
"""
from pathlib import Path
import re
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
def re_capt (**keys):
    """Wrap each given regular expression in a named capture group.

    :param dict keys: A dictionary from capture group names to corresponding
        text regular expressions.
    :return: A dictionary from capture group names to corresponding regular
        expressions in the form (?P<[capture]>[contents]) where [capture] is
        replaced by the capture name and [contents] is replaced by the
        contents regular expression.
    """
    wrapped = {}
    for capture, contents in keys.items ():
        wrapped[capture] = '(?P<%s>%s)' % (capture, contents)
    return wrapped
# Matches a template source URL on the main UW site, capturing the site name
# (first path component) and the remaining path.  The dot in the host name is
# escaped so that it only matches a literal "."; a raw string keeps the
# backslash intact.
url_re = re.compile (r'^https://uwaterloo\.ca/%(site)s/%(path)s$' %
                     re_capt (site='[^/]*', path='.*'))
# Fix & adjust template
def fix_url (url):
    """Make the given URL absolute unless it is only a fragment.

    :param url: The URL to adjust.
    :return: The adjusted URL.

    A URL whose scheme, network location, path, parameters and query are all
    empty (i.e. at most a fragment) is returned unchanged.  Anything else is
    joined onto the root of the main UW site so the link keeps pointing where
    it did in the original page.
    """
    # urlparse yields (scheme, netloc, path, params, query, fragment); only
    # the first five decide whether the URL needs resolving.
    leading_parts = urlparse (url)[:5]
    if any (part != '' for part in leading_parts):
        return urljoin ('https://uwaterloo.ca', url)
    return url
def _clear_login_area (template):
    """Empty the ``cas_login`` div to leave space for a login/logout link."""
    template.find ('div', id="cas_login").clear ()


def _absolutize_links (template):
    """Pass every ``href`` and then every ``src`` attribute through fix_url."""
    for attribute in ('href', 'src'):
        for tag in template.find_all (**{attribute: True}):
            tag[attribute] = fix_url (tag[attribute])


def _apply_site_identity (template, template_dir):
    """Apply the optional ``sitetitle`` and ``sitehome`` overrides.

    :return: The site title in effect: the contents of the ``sitetitle``
        file if it exists, otherwise the text of the existing home link.
    """
    sitelink = (template
                .find ('div', id="site-header")
                .find ('div', class_="uw-section--inner")
                .a)
    sitetitlefile = template_dir / 'sitetitle'
    if sitetitlefile.exists ():
        sitetitle = sitetitlefile.read_text ().strip ()
        sitelink.string = sitetitle
        sitelink['title'] = sitetitle
    else:
        sitetitle = sitelink.string.strip ()
    sitehomefile = template_dir / 'sitehome'
    if sitehomefile.exists ():
        sitelink['href'] = sitehomefile.read_text ().strip ()
    return sitetitle


def _clear_breadcrumbs (template):
    """Empty the ordered list inside the breadcrumb navigation."""
    breadcrumblist = (template
                      .find ('div', id="main")
                      .find ('div', class_="uw-site--breadcrumb")
                      .find ('nav', class_="breadcrumb")
                      .find ('ol'))
    breadcrumblist.clear ()


def _append_stylesheet (template, template_dir):
    """Append a stylesheet link for ``/static/wcms/main.css`` to the head."""
    # **TODO: Make this adjustment when and only when deploying to non-Odyssey
    # servers, rather than looking at which site is involved.
    if template_dir.name.startswith ('odyssey'):
        server = ''
    else:
        server = '//odyssey.uwaterloo.ca'
    template.head.append (template.new_tag ('link',
        href=server + "/static/wcms/main.css",
        media="all",
        rel="stylesheet",
        type="text/css"
    ))


def _clear_menu (template):
    """Empty the main menu and remove its siblings ("Information For")."""
    mainmenu = template.find ('div', id="main-menu")
    mainmenu.clear ()
    menuparent = mainmenu.parent
    # Clearing the parent detaches the menu too, so put it back afterwards.
    menuparent.clear ()
    menuparent.append (mainmenu)


def _blank_content (template, sitetitle):
    """Blank the page title, heading, sidebar, content and sharing areas."""
    template.title.clear ()
    titlespan = template.new_tag ('span', id="wcms-main-title")
    template.title.append (titlespan)
    template.title.append (' | %s | University of Waterloo' % sitetitle)
    template.find ('div', class_="uw-site--title").h1.clear ()
    template.find ('div', id="site-sidebar-wrapper").find ('div', class_="content").clear ()
    # This removes the content, but also removes the sharing links for
    # reasons that are not entirely clear.
    template.find ('div', id="content").clear ()
    # The sharing links usually get installed into the following, but they
    # aren't that useful anyway in our context so just remove it all:
    template.find ('div', id="uw-site-share").decompose ()


def _strip_metadata (template):
    """Drop page-specific body classes and head metadata."""
    template.body['class'] = [c for c in template.body['class']
                              if not c.startswith ("page-node-")]
    # Collect everything first, then remove, as three separate searches.
    for item in ([]
        + template.head.find_all ('link', rel=["canonical", "shortlink"])
        + template.head.find_all ('meta', property=["og:url", "og:title", "og:description", "og:updated_time", "article:published_time", "article:modified_time"])
        + template.head.find_all ('meta', attrs={'name': ["description", "twitter:url", "twitter:title", "twitter:description"]})
    ):
        item.decompose ()


def _insert_footer (template, template_dir):
    """Replace the site footer with the ``footer`` file contents, if any."""
    footerfile = template_dir / 'footer'
    if footerfile.exists ():
        footerdiv = (template
                     .find ('div', id="site-footer")
                     .find ('div', class_="uw-section--inner"))
        footerdiv.clear ()
        footer = BeautifulSoup (footerfile.read_text (), 'lxml')
        if footer.body is not None:
            # Move the parsed <body> in, then unwrap it so only its
            # children remain in the footer div.
            footerbody = footer.body.extract ()
            footerdiv.append (footerbody)
            footerbody.unwrap ()


def _insert_head_extras (template, template_dir):
    """Append the ``header`` file contents to the template head, if any."""
    headerfile = template_dir / 'header'
    if headerfile.exists ():
        header = BeautifulSoup (headerfile.read_text (), 'lxml')
        if header.head is not None:
            # Move the parsed <head> in, then unwrap it so only its
            # children remain in the template head.
            headerhead = header.head.extract ()
            template.head.append (headerhead)
            headerhead.unwrap ()


def fix_template (template, template_dir):
    """Make required adjustments and corrections to the template.

    :param template: The parsed template document; it is modified in place.
    :param template_dir: The directory with information specifying the
        details of the template.  The optional files ``sitetitle``,
        ``sitehome``, ``footer`` and ``header`` in it are consulted.

    The steps run in a fixed order.  In particular, URLs are made absolute
    before the stylesheet link, footer and extra head contents are inserted,
    so those insertions are not passed through fix_url.
    """
    # Clear space for login/logout link
    _clear_login_area (template)
    # Fix relative URLs to be absolute
    _absolutize_links (template)
    # Replace home page link if needed
    sitetitle = _apply_site_identity (template, template_dir)
    # Blank out breadcrumbs
    _clear_breadcrumbs (template)
    # Insert reference to main.css
    _append_stylesheet (template, template_dir)
    # Clear out menu, including "Information For"
    _clear_menu (template)
    # Blank out content areas
    _blank_content (template, sitetitle)
    # Remove unneeded metadata
    _strip_metadata (template)
    # Insert footer associated with this template
    _insert_footer (template, template_dir)
    # Insert extra <head> contents associated with this template
    _insert_head_extras (template, template_dir)
def process_template (template_dir, download_dir, output_dir):
    """Process a single template.

    :param template_dir: The directory with information specifying the
        details of the template to process.  Must contain a file specifying
        the source URL of the template.
    :param download_dir: The base directory for the template download.  Must
        contain the template file downloaded by the download process.
    :param output_dir: The directory for the processed template.
    :raises ValueError: If the source URL does not match the expected form.

    Reads the source URL and uses it to find the input template.  Processes
    the template document and writes the output to a corresponding filename
    in the output directory.
    """
    urlfile = template_dir / 'source'
    # Strip surrounding whitespace, as is done for the other per-template
    # files, so a trailing newline or space never ends up in the file path.
    url = urlfile.read_text ().strip ()

    m = url_re.match (url)
    if m is None:
        # Format the message explicitly: exceptions do not interpolate
        # logging-style arguments.
        raise ValueError ('Invalid URL in %s: %s' % (urlfile, url))

    # Compute path of downloaded template file
    site = m.group ('site')
    path = m.group ('path')
    rawfile = download_dir / 'uwaterloo.ca' / site / path
    if not path or path.endswith ('/'):
        # A directory-style URL is looked up as index.html
        rawfile /= 'index.html'

    # Parse template
    template = BeautifulSoup (rawfile.read_text (), 'lxml')

    # Adjust template contents
    fix_template (template, template_dir)

    # Save adjusted template
    (output_dir / template_dir.name).write_text (str (template))
def process_templates (datadir, download_dir, output_dir):
    """Process all templates.

    :param datadir: The base directory containing a directory for each
        template with information specifying the details of the template.
    :param download_dir: The base directory for the template download.
    :param output_dir: The directory for the processed templates.

    Entries of ``datadir`` that are not directories are skipped, and the
    template directories are visited in sorted order so that runs are
    reproducible.  The path of each template directory is printed before it
    is processed.
    """
    for template_dir in sorted (datadir.iterdir ()):
        if not template_dir.is_dir ():
            # Stray files cannot hold a template description.
            continue
        print (template_dir)
        process_template (template_dir, download_dir, output_dir)
# Default locations used by main (), all under the current user's home
# directory.
datadir = Path.home ().joinpath ('git/odyssey/project/wcms/data')
wcmsbasedir = Path.home ().joinpath ('wcms')
downloaddir = wcmsbasedir.joinpath ('download')
templatedir = wcmsbasedir.joinpath ('template')
def main ():
    """Process all templates using the module-level default directories."""
    process_templates (datadir=datadir,
                       download_dir=downloaddir,
                       output_dir=templatedir)