Tridecane page maker

Create joke pages for the unpreferred www and tridecane subdomains. The pages will link to the intended page.

See the doc-string for `main` for more information.

Last modified	2022-08-15
Lines	387

Parent directory Download CGIread sitemap Main page

Quick links: check_output_nofail global-variables main make_html_link overloaded page

```
#!/usr/bin/python
```
```
# Copyright (c) 2016, Oskar Skog
```

# This file is released under the 2-clause BSD license,

```
# a.k.a. the FreeBSD license
```

# URL: <http://oskog97.com/policy.html#license>

```
'''
```

Create joke pages for the unpreferred www and tridecane subdomains.

The pages will link to the intended page.

See the doc-string for `main` for more information.

Try it at: http://tridecane.oskog97.com/read/?path=/tridecane/index.cgi

```
'''
```
```
import sys
```
```
sys.path.append('/var/www') # NOTICE
```
```
import htmlescape
```
```
import compressout
```
```
import os
```
```
import base64
```
```
import re
```
```
import subprocess
```
```
import time
```
```
import cgitb
```
```
cgitb.enable()
```
```
#id="global-variables"
```
```
# Global variables
```
```
host = 'oskog97.com'
```
```
proto = 'https'
```
```
base_dir = 'tridecane'
```
```
fs_base_dir = '/var/www/tridecane'
```

referer_site = 'http://tridecane.oskog97.com'

```
user_agent = ('Tridecane linkmaker ' +
```

    'https://oskog97.com/read/?path=/tridecane/index.cgi')

```
redirect = [
```

    (r'^/favicon\.ico$', r'/favicon.png'),

    (r'^/sitemap(.*)\.xml$', r'/sitemap\1.xml'),

    (r'^/read\.py\?(.*)sitemap=xml(&.*)?$', r'/read.py?sitemap=xml'),

    (r'^/google(.*)\.html$', r'/google\1.html'),

```
]
```

# '*' means that QUERY_STRING must not be altered

# while a *list* of strings means that only that listed parameters can

# be used and should only be used in the specified order.  (Mutually

# exclusive parameters are NOT handled.)

```
parameters = {
```

    '/read/': ['sitemap', 'path', 'download', 'referer', 'title'],

```
    '/test.cgi': '*',
```
```
}
```

# I don't want the system to be overloaded, so I'll put some

```
# restrictions here.
```
```
maxload = {
```

    'throttle-file': '/var/www/tridecane/throttle',

```
    'throttle-requests': 3,
```
```
    'throttle-time': 6,
```
```
    'load-avg-1': 3.5,
```
```
    'retry-after': 90,
```

    '503-file': '/var/www/oops/cgi503.html',

```
}
```
```
def make_html_link(destination):
```
```
    '''
```

    Create html code (inline, doesn't come pre-wrapped in a p element)

```
    with a link to `destination`.
```
```
    
```
```
    `destination` MUST be a valid URL.
```
```
    
```

    The link text will be pulled from the target page's title if any.

```
    
```

    If the target responds with 4xx a link to the homepage will be

    returned instead.  The hostname and protocol for the homepage

    are defined in the globabl variables `host` and `proto`.

```
    
```

    This function requires the HEAD(1) and GET(1) command from

```
    lwp-request(1).
```
```
    '''
```
```
    def check_output_nofail(*args):
```
```
        cmd = ' '.join(map(
```

            lambda arg: "'" + arg.replace("'", "'\"'\"'") + "'",

```
            args
```
```
        ))
```

        return subprocess.check_output(cmd + ' || true', shell=True)

```
        
```
```
    head = check_output_nofail(
```
```
        'HEAD',
```

        '-H', 'User-Agent: ' + user_agent,

        '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),

```
        destination,
```
```
    )
```

    # NOTICE: `status` is only the first digit.

```
    status = head[0]
```
```
    if status == '4':
```

        html = """The page you're looking for doesn't seem to exist.

            Would you like to go to the <a href="{}">homepage</a> instead?

            """.format(proto + '://' + host + '/')

```
    elif status == '5':
```
```
        html = htmlescape.escape(
```

            '''&lt;<a href="{}" rel="nofollow">{}</a>&gt; is temporarily

            malfunctioning.  Try again later.''',

```
            2, destination,
```
```
            1, destination,
```
```
        )
```
```
    elif status != '2':
```
```
        html = htmlescape.escape(
```

            'Unknown error with &lt;<a href="{}" rel="nofollow">{}</a>&gt;',

```
            2, destination,
```
```
            1, destination,
```
```
        )
```
```
    else:
```
```
        ishtml = (
```

            ('Content-Type: text/html' in head) or

            ('Content-Type: application/xhtml+xml' in head)

```
        )
```
```
        # Fetch the body if HTML.
```
```
        if ishtml:
```
```
            body = check_output_nofail(
```
```
                'GET',
```

                '-H', 'User-Agent: ' + user_agent,

                '-H', 'Referer: ' + referer_site + os.getenv('REQUEST_URI'),

```
                destination,
```
```
            )
```
```
        else:
```
```
            body = ''
```

        # Use the HTML title if available.

        if '<title>' in body and '</title>' in body:

            title = body.split('<title>')[1].split('</title>')[0]

```
        else:
```

            title = htmlescape.escape('{}', 1, destination)

```
        #
```

        html = htmlescape.escape('''Perhaps you're looking for

            <a href="{}" rel="nofollow">{}</a> (without the www)?''',

```
            2, destination,
```
```
            0, title,
```
```
        )
```
```
    return html
```
```
    
```
```
def page(request_uri):
```
```
    '''
```

    Print out the page for the (partially) canonicalized `request_uri`.

```
    
```

    This function assumes `compressout.init` has already been called

    and that `compressout.done` will be called after returning.

```
    '''
```

    xhtml_mime = 'application/xhtml+xml'

```
    mime = 'text/html'
```

    if xhtml_mime in os.getenv('HTTP_ACCEPT', ''):

```
        mime = xhtml_mime
```

    compressout.write_h('Status: 404\n')

    compressout.write_h('Content-Type: {}; charset=UTF-8\n'.format(mime))

```
    compressout.write_h('\n')
```

    compressout.write_b('''<!DOCTYPE html>

<html lang="en" xmlns="http://www.w3.org/1999/xhtml">

```
    <head>
```
```
        <meta charset="utf-8"/>
```

        <meta name="robots" content="noindex"/>

        <meta name="viewport" content="width=device-width, initial-scale=1"/>

        <link rel="icon" type="image/png"

            href="{proto}://{host}/{base_dir}/favicon.png"/>

```
        <style type="text/css">
```
```
            {stylesheet}
```
```
        </style>
```

        <title>Sorry, but I'm not /that/ interested in chemistry</title>

```
    </head>
```
```
    <body>
```
```
        <p class="skeletal">
```

            <a href="https://en.wikipedia.org/wiki/Tridecane" rel="nofollow"

```
            ><img 
```

                src="{proto}://{host}/{base_dir}/skeletal.png"

                alt="(skeletal formula for tridecane) www"

```
                width="150" height="24"
```
```
            /></a>.{host}
```
```
        </p>
```

        <h1>Sorry, but I'm not <em>that</em> interested in chemistry</h1>

```
        <p>
```

            <a href="https://en.wikipedia.org/wiki/Tridecane" rel="nofollow"

            ><img src="{proto}://{host}/{base_dir}/model.png"

                alt="(Image): balls and sticks model of tridecane"

                width="800" height="249"

```
            /></a><br/>
```
```
            <a rel="nofollow"
```

href="https://en.wikipedia.org/wiki/File:Tridecane_3D_ball-and-stick_model.png"

            >(Image is taken from Wikipedia. License: CC-BY-SA)</a>

```
        </p>
```
```
'''.format(
```

        host=host, proto=proto, base_dir=base_dir,

        stylesheet=open(fs_base_dir + '/style.css').read(),

```
    ))
```

    compressout.write_b('<p class="link">\n{}\n</p>\n'.format(

        make_html_link(proto + '://' + host + request_uri)

```
    ))
```
```
    compressout.write_b('''
```

        <p>The acronym WWW has some interesting properties</p>

```
        <ul>
```
```
            <li>
```

                It takes approximately as long to pronounce WWW as it takes

                to pronounce "world wide web".

```
            </li>
```
```
            <li>
```

                It looks like the skeletal formula for tridecane, but

                "tridecane" is definitively shorter when pronounced.

```
            </li>
```
```
            <li>
```

                It's used to make many URLs four bytes longer for no good

```
                reason.
```
```
            </li>
```
```
        </ul>
```
```
        <p class="footer">
```

            Page made by <a rel="nofollow"

href="https://oskog97.com/read/?path=/tridecane/index.cgi&amp;referer=http://tridecane.oskog97.com/&amp;title=Back+to+the+tridecane+page"

```
            >Tridecane</a>.
```
```
        </p>
```
```
    </body>
```
```
</html>\n''')
```
```
def main():
```
```
    r'''
```

    Handle requests to the tridecane/www subdomain.

```
    
```
```
    - /robots.txt is served properly.
```
```
    - Certain URLs can be redirected.
```

    - Static pages will have parameters stripped out.

    - Dynamic pages will have the valid parameters sorted.

    - Dynamic pages that don't use the usual format for the query

```
      string are also supported.
```
```
    
```

    This function assumes `compressout.init` has already been called

    and that `compressout.done` will be called after returning.

```
    
```

    The global variable `redirect` is a list of tuples of

    (regex, replacement).  The replacement part follows the Python

    regex syntax with \1 \2 ... as back-references.

```
    
```

    The global variable `parameters` is a dictionary where the keys

    are the parts before '?' of the relative URLs to the dynamic pages.

    The value is either '*' which means that the query string will be

    untouched, or a list of strings where each string is a valid

    parameter/variable. The parameters on the canonicalized relative

    URL will come in the same order as specified in `parameters`.

```
    
```
```
    Misc global variables
```
```
    ---------------------
```
```
        
```

        `host`          The hostname for the canonical site version.

```
        
```

        `proto`         'http' or 'https'

```
        
```

        `base_dir`      Relative URL without leading and trailing slash;

                        where to find external files (images) for the

                        generated pages.

```
        
```

        `fs_base_dir`   Absolute filesystem path without trailing slash

                        to the directory `base_dir`.  (robots.txt and

                        style.css are supposed to be there.)

```
        
```

        `referer_site`  For setting the Referer HTTP header when

                        pulling in the title from the preferred site

```
                        version.
```

                        scheme://host (no trailing slash)

                        host is the hostname for the tridecane site.

```
    
```

        `user_agent`    For setting the User-Agent HTTP header when

                        pulling in the title from the preferred site

```
                        version.
```
```
        
```

        `maxload`       See the docstring for `overloaded`.

```
        
```
```
    
```
```
    '''
```

    request_uri = os.getenv('REQUEST_URI')

    #query_string = os.getenv('QUERY_STRING', '') # BUG

    compressout.write_h('Cache-Control: max-age=1209600\n')

```
    # /robots.txt
```
```
    if request_uri == '/robots.txt':
```
```
        try:
```

            robots_txt = open(fs_base_dir + '/robots.txt').read()

            compressout.write_h('Content-Type: text/plain\n\n')

            compressout.write_b(robots_txt.read())

```
        except IOError:
```

            compressout.write_h('Status: 404\n\n')

```
        return
```
```
    # Deal with redirections.
```
```
    for regex, replacement in redirect:
```

        if re.match(regex, request_uri) is not None:

            destination = re.sub(regex, replacement, request_uri)

            compressout.write_h('Status: 301\n')

```
            compressout.write_h(
```

                'Location: {proto}://{host}{destination}\n'.format(

                    host=host, proto=proto, destination=destination

```
                )
```
```
            )
```
```
            compressout.write_h('\n')
```
```
            return
```

    # Automatically canonicalize parameters.

```
    if '?' in request_uri:
```

        cgi_name, query_string = request_uri.split('?', 1)

```
        if cgi_name not in parameters:
```

            # Should not have any parameters.

```
            request_uri = cgi_name
```

        elif parameters[cgi_name] == '*':

            # Do not change the request_uri.

```
            pass
```
```
        else:
```
```
            # Auto-correct request_uri.
```
```
            valid_parameters = []
```

            for valid in parameters[cgi_name]:

                if (valid + '=') in query_string:

                    if query_string.startswith(valid + '='):

                        value = query_string.split('=', 1)[1]

```
                    else:
```

                        value = query_string.split('&' + valid + '=')[1]

                    value = value.split('&')[0]

                    valid_parameters.append(valid + '=' + value)

            request_uri = cgi_name + '?' + '&'.join(valid_parameters)

```
    # Let `page` print the actual page.
```
```
    page(request_uri)
```
```
    
```
```
def overloaded():
```
```
    '''
```
```
    Prevent over-revving the server.
```
```
    
```

    Returns True if a 503 page should be shown, and False if not.

```
    
```

    The global variable `maxload` is a dictionary containing:

```
    
```

        `load-avg-1`            Maximum average load during the last

```
                                minute.
```
```
        
```

        `throttle-file`         A file for recording the times of

                                the last `throttle-requests` requests.

                                Initial content SHOULD be '0\n', ie. a

```
                                zero.
```
```
        
```

        `throttle-requests`     The highest allowed number of requests

                                in `throttle-time` seconds.

```
        
```

        `throttle-time`         The shortest allowed time

                                `throttle-requests` requests are

                                allowed to be made.

```
        
```

        `retry-after`           Time in seconds for the Retry-After

                                HTTP header.

```
        
```

        `503-file`              The filesystem path to a static HTML

                                file with a Service Unavailable

                                message.

```
                                
```
```
    '''
```
```
    def status503():
```

        compressout.write_h('Status: 503\n')

        compressout.write_h('Content-Type: text/html; charset=UTF-8')

        compressout.write_h('Retry-After: {}\n'.format(maxload['retry-after']))

```
        compressout.write_h('\n')
```

        compressout.write_b(open(maxload['503-file']).read())

```
    
```

    if os.getloadavg()[0] > maxload['load-avg-1']:

```
        status503()
```
```
        return True
```
```
    try:
```
```
        access_times = map(
```

            float, open(maxload['throttle-file']).read().strip().split(':')

```
        )
```
```
    except:
```
```
        access_times = [0]
```

    if time.time() - access_times[-1] < maxload['throttle-time']:

```
        status503()
```
```
        return True
```
```
    access_times.insert(0, time.time())
```

    access_times = access_times[:maxload['throttle-requests']]

    f = open(maxload['throttle-file'], 'w')

    f.write(':'.join(map(str, access_times)) + '\n')

```
    f.close()
```
```
if __name__ == '__main__':
```
```
    compressout.init()
```
```
    if not overloaded():
```
```
        main()
```
```
    compressout.done()
```