#!/usr/bin/python3
# -*- coding: utf-8 -*-

root = '/var/www'
owner = 'Oskar Skog'
my_url = '/read/'
canonical_url = 'https://__HOST__/read/'
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'

import sys
sys.path.append(root)
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap   # Name conflict with already existing function.
import cgitb
cgitb.enable()

rootlen = len(root)
#html_mime = 'text/html'    # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
conf = eval(open('read.cfg').read())


def redirect_spam(destination):
    '''`destination` is the URL to which assholes should be redirected.'''
    compressout.write_h('Status: 303\n')
    compressout.write_h('Location: {}\n'.format(destination))
    compressout.write_h('\n')


def status400(message):
    '''HTTP 400; `message` goes UNESCAPED inside an element of the page body.'''
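    # NOTE: callers must pass `message` as already-safe HTML; it is
    # interpolated into the error page without further escaping.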
    compressout.write_h('Status: 400\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    compressout.write_b('''__HTML5__
        400 - Bad Request
    
    
        __NAVIGATION__
        

400 - Bad Request

{}

Your request can't be understood. Check the parameters.

Documentation for the parameters

'''.format(message))
    compressout.write_b('''
    __FOOTER__
''')


def status403():
    '''HTTP 403'''
    compressout.write_h(html_page)
    compressout.write_h('Status: 403\n\n')
    compressout.write_b(open(html403file).read())


def status404():
    '''HTTP 404'''
    compressout.write_h('Status: 404\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    compressout.write_b(open(html404file).read())


def status503():
    '''
    HTTP 503
    Call this if there is too much load on the server to do something.
    (Used by the sitemap function.)
    '''
    compressout.write_h('Status: 503\n')
    compressout.write_h(html_page)
    # One factor is load avg for 1 minute, add some slop to the delay for bots.
    compressout.write_h('Retry-After: 90\n')
    compressout.write_h('\n')
    compressout.write_b(open(html503file).read())


def index_page():
    '''https://oskog97.com/read/'''
    # Handle 304s.
    ETag = '"{}{}{}"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat('index.py').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(html_page)
    if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
        compressout.write_h('Status: 304\n\n')
        return
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    # Write out a static page.
    compressout.write_b('''__HTML5__
        __TITLE__
        __NAVIGATION__
        __H1__
''')
    compressout.write_b('''

Interested in the scripts I have on my website? Go take a look at them; start crawling the root directory or check out the (sub)sitemap.

Parameter syntax

Descriptions for the parameters can be found in the request forms.

  • Asterisks * represent a value that can be (almost) anything.
  • Square brackets [] represent optional.
  • Curly brackets {{}} represent mandatory.
  • Pipes | represent either or.

There are three acceptable "sets" of parameters:

  1. {0}?sitemap={{html|xml}}
  2. {0}?path=*[&download=yes]
  3. {0}?path=*[&referer=*[&title=*]]

The order of the valid parameters doesn't matter, but this is the recommended/canonical order.

Request forms

Notice that these are three different forms.

Sitemap

The sitemap parameter can be either html, xml or the default none. It can't be used together with any other parameters.

Request an HTML sitemap instead of a page
Request an XML sitemap instead of a page

Page

A page (source code of a CGI script) is selected with the path parameter. The value of the path parameter is a URL relative to this site, i.e. a URL beginning with a single slash.

The path is the site-local URL to the CGI script or directory you're interested in. If you set the value to /read/index.py, you'll get the source code for this script. And if you set it to /, you'll get a directory listing of the site's root directory.

Path/URL:
Download / see it as plain text

The download parameter can be set to either yes or the default no. The download option obviously does not work with directories.

Link back to a referencing page

If download is no or unset and a page (not a sitemap) was requested, it is possible to change the navigation to make the requested page link back to a referring page.

The referer (yes, misspelled like the HTTP Referer) parameter is the URL of the referencing page. (Don't try to specify a site that isn't mine.) The title parameter gives the back link a different text than Back.

path
referer
title
'''.format(my_url))
    compressout.write_b('''
    __FOOTER__
''')


def noindex(path):
    '''
    Returns True if `path` should be noindexed.
    `path` is an absolute **filesystem** path.
    '''
    def isword(w):
        letters = string.ascii_letters + ',.'
        for ch in w:
            if ch not in letters:
                return False
        return True
    # 1. White list
    # 2. Black list
    # 3. Page quality (not applicable for directories)
    # Check whitelist first.
    for regex in conf['doindex']:
        if re.match(regex, path[rootlen:]) is not None:
            return False
            break
    # Blacklist (two kinds):
    # - Generated from another file.
    # - Explicitly blacklisted in 'read.cfg'.
    for match, replace in conf['madefrom']:
        if re.match(match, path[rootlen:]) is not None:
            try:
                os.stat(root + re.sub(match, replace, path[rootlen:]))
                return True
            except:
                pass
    for regex in conf['noindex'] + conf['hide']:
        if re.match(regex, path[rootlen:]) is not None:
            return True
    # Quality:
    # - Text file
    # - At least 3072 Unicode code points
    # - At least 300 words
    # - At least 60 lines
    # - Half the limitations if a meta description and title are found
    # - A third of the limitations if an onpage description is found
    try:
        os.listdir(path)
        return False
    except:
        pass
    # Normal file.
    try:
        if sys.version_info[0] > 2:
            text = open(path).read()
        else:
            text = open(path).read().decode('utf-8')
    except:
        return True
    min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
    quality = mk_description(path)[0] + 1
    min_chars //= quality; min_words //= quality
    min_lines //= quality; min_comments //= quality
    if len(text) < min_chars:
        return True
    if text.count('\n') + 1 < min_lines:
        return True
    n_comments = 0
    is_comment = re.compile('^(.*#.*| *\\* .*|.* ''', URL=(2, referer[0]), title=(1, referer[1]), me=(1, title), my_url=(0, my_url), ) else: return '''__NAVIGATION__'''


def mk_referer_param(referer):
    '''Returns one of:
        ''
        '&referer=' + referer[0]
        '&referer=' + referer[0] + '&title=' + referer[1]
    to be added to links from the requested page.

    `referer` is used to **optionally** ``integrate`` a page.
    See `mk_navigation`
    '''
    if referer[0]:
        if referer[1] != 'Back':
            title = '&title={}'.format(referer[1])
        else:
            title = ''
        return '&referer={}{}'.format(referer[0], title)
    else:
        return ''


def mk_description(path):
    '''
    Returns a tuple: (good, title, meta_description, onpage_description)

    `path` is the absolute filesystem path to the requested page.

    `good` is
        0   no title and description
        1   title and meta description only
        2   also an onpage description
    `title` is the title of the page.
    `meta_description` is the content of the description meta tag.
    `onpage_description` is HTML content for the onpage description.
    '''
    good = 0
    title = "source code of {}".format(path[rootlen:])
    meta_description = ''
    onpage_description = None
    try:
        content = open(path + '.info').read().split('\n')
        good = 1
    except:
        pass
    if good:
        title = content[0]
        try:
            sep = content.index('.')
        except ValueError:
            sep = None
        if sep is not None:
            good = 2
            meta_description = '\n'.join(content[1:sep])
            onpage_description = '\n'.join(content[sep+1:])
        else:
            meta_description = '\n'.join(content[1:])
    if onpage_description is None:
        onpage_description = htmlescape.escape('''

{}

''', 1, meta_description)
    return good, title, meta_description, onpage_description


def sitemap(sitemap_type):
    '''
    Write out an XML or HTML sitemap.
        sitemap_type in ('xml', 'html')
    The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
    '''
    if os.getenv('REQUEST_METHOD') != 'HEAD':   # NOTICE
        # Prevent over-revving the server.
        # HEAD requests are basically no-ops.
        maxload = conf['sitemap-maxload']
        if os.getloadavg()[0] > maxload['load-avg1']:
            status503()
            return
        try:
            access_times = list(map(
                float,
                open('read.throttlecontrol').read().strip().split(':')
            ))
        except:
            access_times = [0]
        if time.time() - access_times[-1] < maxload['throttle-time']:
            status503()
            return
        access_times.insert(0, time.time())
        access_times = access_times[:maxload['throttle-requests']]
        f = open('read.throttlecontrol', 'w')
        f.write(':'.join(list(map(str, access_times))) + '\n')
        f.close()
    # Write headers before doing anything else.
    # A HEAD request doesn't need to know the length (it's TE chunked).
    if sitemap_type == 'xml':
        compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
        compressout.write_h(
            'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
            '; rel="canonical"' +
            '; type="text/html"\n'
        )
        compressout.write_h('X-Robots-Tag: noindex\n\n')    # NOTE: last.
    elif sitemap_type == 'html':
        compressout.write_h(html_page)
        compressout.write_h('\n')
    else:
        assert False, "Neither 'xml' nor 'html'"
    if os.getenv('REQUEST_METHOD') == 'HEAD':   # NOTICE
        return
    # Find the pages worth being in the sitemap.
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    paths = []
    for basedir, dirs, files in os.walk(root, topdown=True):
        # Exclude hidden directories:
        remove_list = []
        sys.stderr.write('In {}\n'.format(basedir))
        sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
        for dirname in dirs:
            dirpath = os.path.join(basedir, dirname)[rootlen:]
            for regex in no_access:
                if re.match(regex, dirpath) is not None:
                    #dirs.remove(dirname)
                    # BUG: The for loop will skip items in the list if
                    # other items are removed while looping.
                    # This caused some real' nasty stuff like sshin to
                    # be crawled, took a whopping .65 seconds.
                    remove_list.append(dirname)
                    break
        sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
        for dirname in remove_list:
            dirs.remove(dirname)
        # Iterate over files:
        for filename in files:
            filepath = os.path.join(basedir, filename)
            # No symlinks allowed.
            #if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
            if not os.path.islink(filepath):
                #try:
                description = mk_description(filepath)
                if description[0]:
                    # Only indexable content allowed.
                    if not noindex(filepath):
                        paths.append((filepath[rootlen:], description[3]))
                    else:
                        sys.stderr.write('{} is noindexed\n'.format(filepath))
                else:
                    sys.stderr.write('{} has no description\n'.format(filepath))
                #except IOError as error:
                #    assert error.errno in (
                #        errno.EISDIR, errno.EACCES
                #    ), error.errno
            else:
                sys.stderr.write('{} is link\n'.format(filepath))
    paths.sort(key=lambda x: x[0])
    # Print the body.
    if sitemap_type == 'xml':
        compressout.write_b('''
''')
        #
        for path, description in paths:
            # Loop through all the regexes:
            for regex in conf['noxmlsitemap']:
                if re.match(regex, path) is not None:
                    break
            else:
                compressout.write_b(htmlescape.escape('''
                    {canonical_url}?path={path}
                    0.5
''',
                    canonical_url=(0, canonical_url),
                    path=(1, path),
                ))
                mod_sitemap.lastmod_changefreq(
                    root + path,
                    compressout,
                )
                compressout.write_b('\n')
        #
        compressout.write_b('\n')
    elif sitemap_type == 'html':
        compressout.write_b('''__HTML5NC__
        Sitemap for scripts' source code
        __NAVIGATION__

Sitemap for scripts' source code

Root directory

'''.format(my_url=my_url, canonical_url=canonical_url))
        #
        indent = 16 * ' '
        for path, description in paths:
            compressout.write_b(indent + htmlescape.escape(
                '''
{path}
\n''',
                path=(0, path),
                my_url=(0, canonical_url),
            ))
            compressout.write_b(indent + htmlescape.escape('''
{}
\n''', 0, description)
            )
        #
        compressout.write_b('''
        __FOOTER__
''')
    else:
        assert False, "Neither 'xml' nor 'html'"


def ls(path, referer):
    '''
    Write out a directory listing page for `path`.
    '''
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    compressout.write_b('''__HTML5NC__''')
    compressout.write_b(htmlescape.escape('''
        Index of {name}
        {navigation}

Index of {name}

{isroot_commentout_start} Parent directory {isroot_commentout_end} CGIread sitemap Main page

''',
        name          =(1, path[rootlen:] + '/'),
        parent_path   =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
        robots_follow =(2, 'no'*noindex(path)+'follow'),
        navigation    =(0, mk_navigation(
            referer,
            "Index of "+path[rootlen:]+'/'
        )),
        referer_params=(2, mk_referer_param(referer)),
        my_url=(0, my_url),
        canonical_url=(0, canonical_url),
        isroot_commentout_start=(0, ''*(path == root)),
        isroot_commentout_end=(0, ''*(path == root)),
    ))
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    for x in sorted(os.listdir(path)):
        full_path = os.path.join(path, x)
        forbidden = False
        for regex in no_access:
            if re.match(regex, full_path[rootlen:]) is not None:
                forbidden = True
                break
        if forbidden:
            continue
        #url = cgi.escape(full_path, quote=True)
        try:
            os.listdir(full_path)
            is_dir = 1
        except:
            is_dir = 0
        # mobile_desc
        # desktop_desc
        if is_dir:
            mobile_desc = '->'
            desktop_desc = 'Directory'
        else:
            try:
                content = open(full_path).read()
                # This fails on Python 3 !!!
                if sys.version_info[0] == 2:
                    content.decode('UTF-8')
                binary = False
            except:
                binary = True
            if binary:
                desktop_desc = 'Binary'
                mobile_desc = ':-('
            else:
                good, title, meta_d, onpage_d = mk_description(full_path)
                if good == 2:
                    desktop_desc = htmlescape.escape('{}', 1, meta_d)
                    if noindex(full_path):
                        mobile_desc = ':-)'
                    else:
                        mobile_desc = ':-D'
                elif not noindex(full_path):
                    mobile_desc = ':-)'
                    if compressout.debug_cookie:
                        desktop_desc = 'Text; indexable'
                    else:
                        desktop_desc = 'Text'
                else:
                    mobile_desc = ':-|'
                    if compressout.debug_cookie:
                        desktop_desc = 'Boring; unindexable'
                    else:
                        desktop_desc = 'Looks boring'
        compressout.write_b(
            htmlescape.escape(
                '''
                {mobile_desc} {text} {desktop_desc}
                ''',
                site=(0, my_url),
                path=(2, full_path[rootlen:] + '/'*is_dir),
                referer=(2, mk_referer_param(referer)),
                text=(1, x + '/'*is_dir),
                mobile_desc=(0, mobile_desc),
                desktop_desc=(0, desktop_desc),
            )
        )
    compressout.write_b('''
    __FOOTER__
\n''')


def download(path):
    if noindex(path):
        compressout.write_h('X-Robots-Tag: noindex\n')
    else:
        compressout.write_h('X-Robots-Tag: index\n')    # For verbosity.
    try:
        content = open(path).read()
        if sys.version_info[0] == 2:
            content.decode('utf-8')
        compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
        compressout.write_h(htmlescape.escape(
                'Link: <{}?path={}>', 0, canonical_url, 2, path[rootlen:]
            ) + '; rel="canonical"; type="text/html"\n'
        )
    except:
        compressout.write_h(htmlescape.escape(
            'Link: <{}?path={}>; rel="canonical"\n',
            0, canonical_url, 2, path[rootlen:]
        ))      # No type specified.
    if if_none_match(path):
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'HEAD':
            compressout.write_b(content)


def cat(path, referer):
    '''
    Write out a page showing the source code of `path`, line by line.
    '''
    def ol_content(text):
        out_lines = []
        ids = []
        allowed_chars = string.ascii_letters + '_-'
        for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment for this line.
            this_id = ''
            # Find ids in Python and XHTML
            for decltype in ('def', 'class'):
                if line.strip().startswith(decltype + ' ') and '(' in line:
                    this_id = line.split(decltype, 1)[1].split('(')[0].strip()
            if 'id="' in line:
                this_id = line.split('id="')[1].split('"')[0]
            # Prevent bad ids.
            for ch in this_id:
                if ch not in allowed_chars:
                    this_id = ''
                    break
            if this_id in ids:
                this_id = ''
            # Create the fragment identifier for the line.
            if this_id:
                ids.append(this_id)
                idline = 'id="content_{}"'.format(this_id)
            else:
                idline = ''
            # Create line
            out_lines.append(htmlescape.escape('''
  • {}
  • \n''', 0, index + 1, 0, idline, 1, line,
            ))
        fragment_links = []
        for fragment in sorted(ids):
            fragment_links.append(
                (
                    '{0}\n'
                ).format(
                    fragment
                )
            )
        return ''.join(out_lines), ''.join(fragment_links)
    try:
        content = open(path).read()
        if sys.version_info[0] == 2:
            content.decode('utf-8')
    except:
        if noindex(path):
            compressout.write_h('X-Robots-Tag: noindex\n')
        else:
            compressout.write_h('X-Robots-Tag: index\n')
        compressout.write_h('\n')
        compressout.write_b(content)
        return
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    ignore, title, meta_description, p_description = mk_description(path)
    last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
    lines, fragment_links = ol_content(content)
    if not fragment_links:
        fragment_links = '(none)'
    compressout.write_b('''__HTML5NC__''')
    compressout.write_b(''' ''')
    parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
    compressout.write_b(htmlescape.escape('''
        {title}
        {navigation}

    {title}

    {content_description}
    {begin_debug}{end_debug}
    Last modified {last_modified}
    Lines {linecount}
    Indexable {indexable}

    Parent directory Download CGIread sitemap Main page

    Quick links:\n{fragments}

      {content}
''',
        title=(2, title),
        content=(0, lines),
        parent_dir=(2, parent_link + mk_referer_param(referer)),
        navigation=(0, mk_navigation(referer, path[rootlen:])),
        canonical=(2, canonical_url + '?path=' + path[rootlen:]),
        path=(2, path[rootlen:]),
        noindex_no=(2, 'no' * noindex(path)),
        meta_description=(2, meta_description),
        content_description=(0, p_description),
        last_modified=(2, last_modified),
        linecount=(1, content.count('\n') + 1),
        indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
        fragments=(0, fragment_links),
        my_url=(0, my_url),
        begin_debug=(0, ['', ''][compressout.debug_cookie]),
        end_debug=(0, ['', ''][compressout.debug_cookie]),
    ))
    compressout.write_b('''
    __FOOTER__
''')


def if_none_match(path):
    '''
    ETag handling for `cat`, `ls` and `download`:

    Returns `True` if content needs to be generated.
    Outputs necessary headers and 304 statuses.
    '''
    try:
        meta_time = os.stat(path + '.info').st_mtime
    except:
        meta_time = 0
    if sys.version_info[0] > 2:
        query_string = os.getenv('QUERY_STRING', '').encode('utf-8')
    else:
        query_string = os.getenv('QUERY_STRING', '')
    ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat(path).st_mtime,
        meta_time,
        base64.b64encode(query_string),
        os.stat('index.py').st_mtime,
        os.stat('read.cfg').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(
        '''X-ETag-Synopsis: [x][z]-f_time(m_time)-query-(s_time-c_time)
X-ETag-Description-x: "Client accepts application/xhtml+xml"
X-ETag-Description-z: "Content-Encoding: gzip"
X-ETag-Description-f_time: "Unix last modified time for the requested file"
X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
X-ETag-Description-query: "base64 encoded $QUERY_STRING"
X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
''')
    if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
        compressout.write_h('Status: 304\n\n')
        return False
    else:
        return True


def is_injection_attempt(path_param, referer_URI, referer_title):
    '''
    Various checks to see if any form of injection attempt has been made.
    This function checks the `path`, `referer` and `title` parameters.

    Returns True if the request is an injection attempt.
    - XSS
    - URL injection
    - Spam injection
    - Restricted files access
    '''
    # If the path parameter contains an XSS attempt, it can't be corrected.
    evil = False
    # Prevent attacks.
    if '..' in path_param:
        return True
    for var in referer_URI, referer_title:
        for ch in var:
            if ord(ch) < 32:
                return True
            if ch in '<>&\'"':
                return True
            # NOTICE: The following will limit parameters to ASCII.
            if ord(ch) > 126:
                return True
    # Prevent linking to Mallory.
    for start in ('http://', 'https://', '//', 'ftp://'):
        if referer_URI.startswith(start):
            hostname = referer_URI.split('//')[1].split('/')[0]
            if hostname not in conf['allowed-referer-hosts']:
                return True
            else:
                break
    else:
        if ':' in referer_URI:
            return True
    # Prevent injected spam.
    if spammy.spammy(referer_title) or len(referer_title) > 42:
        return True
    # No match.
    return False


def handle_injection_attempt(path_param, referer_URI, referer_title):
    '''
    Decide if the injection attempt was due to innocently following
    a malicious link or due to creating one.
    '''
    # Check if the URL can be sanitized.
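    # The referer and title parameters are deliberately blanked out here: if
    # the path alone still trips is_injection_attempt(), the URL cannot be
    # sanitized; otherwise the visitor is redirected to a cleaned-up URL that
    # keeps only the path.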
    if is_injection_attempt(path_param, '', ''):
        destination = 'https://en.wikipedia.org/wiki/Data_validation'
    else:
        destination = my_url + '?path=' + path_param
    redirect_spam(destination)


def main():
    '''
    `compressout.init` MUST be called before `main` and `compressout.done` after.
    '''
    # HTML vs XHTML
    global html_page
    html_page = 'Vary: Accept\n'
    if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
        html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
    else:
        html_page += 'Content-Type: text/html; charset=UTF-8\n'
    # Check that the method is either GET, HEAD or OPTIONS.
    if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
        if os.getenv('REQUEST_METHOD') != 'OPTIONS':
            compressout.write_h('Status: 405\n')
        compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
        compressout.write_h('Content-Type: text/plain\n')
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'OPTIONS':
            compressout.write_b('Method not allowed!\n')
            compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
        return
    # Get the parameters.
    params = cgi.FieldStorage()
    path = path_param = params.getfirst('path', default='')
    referer_URI = params.getfirst('referer', default='')
    referer_title = params.getfirst('title', default='Back')
    referer = (referer_URI, referer_title)
    download_flag = params.getfirst('download', default='no')
    sitemap_param = params.getfirst('sitemap', default='none')
    if not os.getenv('QUERY_STRING'):
        index_page()
        return
    # Bad request, but will match the evil patterns.
    # Keep it before the evil stopper.
    if bool(path_param) and not path_param.startswith('/'):
        status400('`path` is not relative to this site. (No leading slash.)')
        return
    # Do not allow evil requests.
    allow = True
    # Keep things within the server root.
    try:
        path = os.path.realpath(root + path)
    except:
        allow = False
    if path != root and not path.startswith(root + '/'):
        allow = False
    # Stop at forbidden paths. #1/2
    for regex in conf['noaccess']:
        if re.match(regex, path[rootlen:]) is not None:
            allow = False
    # Prevent XSS, URL injection, spam injection and miscellaneous assholery.
    if is_injection_attempt(path_param, referer_URI, referer_title):
        allow = False
    if not allow:
        handle_injection_attempt(path_param, referer_URI, referer_title)
        return
    # Bad requests:
    if download_flag not in ('yes', 'no'):
        status400('`download` MUST be "yes", "no" or unset.')
        return
    if bool(path_param) and sitemap_param != 'none':
        status400('The `sitemap` parameter cannot be used with any other.')
        return
    if download_flag == 'yes' and bool(referer_URI):
        status400("`download=yes` can't be used with the `referer` parameter.")
        return
    if sitemap_param not in ('none', 'xml', 'html'):
        status400('`sitemap` MUST be "html", "xml" or unset.')
        return
    if download_flag == 'yes' and not bool(path_param):
        status400('Nothing to `download`. Use the `path` parameter.')
        return
    if bool(referer_URI) and not bool(path_param):
        status400('`referer` cannot be used without `path`.')
        return
    if referer_title != 'Back' and not bool(referer_URI):
        status400('`referer` is not set.')
        return
    if allow:
        # Generate sitemap?
        if sitemap_param != 'none':
            sitemap(sitemap_param)
        else:
            # Stop at forbidden paths. #2/2
            for regex in conf['topsecret']:
                if re.match(regex, path[rootlen:]) is not None:
                    status404()
                    break
            else:
                # Allowed to be seen.
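                # Dispatch on the type of `path`: os.listdir() succeeds for a
                # directory, raises ENOTDIR for a regular file and ENOENT for
                # a nonexistent path.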
                try:
                    os.listdir(path)
                    if download_flag == 'no':
                        if if_none_match(path):
                            ls(path, referer)
                    else:
                        status400("Can't download a directory.")
                except OSError as e:
                    if e.errno == errno.ENOTDIR:
                        if download_flag == 'no':
                            if if_none_match(path):
                                cat(path, referer)
                        else:
                            # `download` sets a few headers.
                            download(path)
                    elif e.errno == errno.ENOENT:
                        status404()
                    else:
                        raise ValueError(
                            'errno must be either ENOTDIR or ENOENT'
                        )


if __name__ == '__main__':
    compressout.init()
    main()
    compressout.done()
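
# ---------------------------------------------------------------------------
# Rough sketch (an assumption, for reference only) of the `compressout`
# interface this script relies on; the real module lives elsewhere on the
# site and is imported from `root`:
#
#     compressout.init()          # inspect Accept-Encoding / debug cookie,
#                                 # set up the output buffers
#     compressout.write_h(text)   # queue a CGI header line
#     compressout.write_b(text)   # queue body content (gzipped if accepted)
#     compressout.done()          # emit the headers and the buffered body
#     compressout.debug_cookie    # truthy when the visitor enabled debugging
# ---------------------------------------------------------------------------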