CGI source code reader script
This is (the source of) the script that generates this very page.
Through this, you can see the source code for all the scripts on my site.
Requirements for a file's /read/ page to be indexable by search
engines:
- Always indexable if whitelisted
- Not manually blacklisted
- Not generated from another file
- A text file
- At least 3072/1536/1024 Unicode code points
- At least 300/150/100 "words"
- At least 60/30/20 lines
- At least 24/12/8 comments
The three figures are the full, half and third thresholds: the middle
values apply when the file has a title and a meta description, the
lowest when it also has an on-page description.
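A minimal sketch (not part of the script below) of how noindex() derives
the thresholds for a file that has a title and a meta description:

min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
# quality is 1 with no description, 2 with a title and meta description,
# 3 when an on-page description exists as well.
quality = 2
min_chars //= quality     # 1536 code points
min_words //= quality     # 150 "words"
min_lines //= quality     # 30 lines
min_comments //= quality  # 12 comments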
#!/usr/bin/python3
# -*- coding: utf-8 -*-
root = '/var/www'
owner = 'Oskar Skog'
my_url = '/read/'
canonical_url = 'https://__HOST__/read/'
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'
import sys
sys.path.append(root)
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap # Name conflict with already existing function.
import cgitb
cgitb.enable()
rootlen = len(root)
#html_mime = 'text/html' # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n' # Set to XHTML later.
conf = eval(open('read.cfg').read())
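# read.cfg is a Python literal (a dict) evaluated at startup.  The keys
# used below are 'doindex', 'madefrom', 'noindex', 'hide', 'noaccess',
# 'topsecret', 'noxmlsitemap', 'allowed-referer-hosts' and
# 'sitemap-maxload'.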
def redirect_spam(destination):
'''`destination` is the URL to which assholes should be redirected.'''
compressout.write_h('Status: 303\n')
compressout.write_h('Location: {}\n'.format(destination))
compressout.write_h('\n')
def status400(message):
'''HTTP 400; `message` goes UNESCAPED inside a <pre> element.'''
compressout.write_h('Status: 400\n')
compressout.write_h(html_page)
compressout.write_h('\n')
compressout.write_b('''__HTML5__
<title>400 - Bad Request</title>
</head>
<body>
__NAVIGATION__
<main><div id="content">
<h1 id="title">400 - Bad Request</h1>
<pre>{}</pre>
<p>
Your request can't be understood.
Check the parameters.
</p>
<p><a href="/read/">Documentation for the parameters</a></p>
</div></main>
'''.format(message))
compressout.write_b('''
__FOOTER__
</body>
</html>''')
def status403():
'''HTTP 403'''
compressout.write_h(html_page)
compressout.write_h('Status: 403\n\n')
compressout.write_b(open(html403file).read())
def status404():
'''HTTP 404'''
compressout.write_h('Status: 404\n')
compressout.write_h(html_page)
compressout.write_h('\n')
compressout.write_b(open(html404file).read())
def status503():
'''
HTTP 503
Call this if there is too much load on the server to do something.
(Used by the sitemap function.)
'''
compressout.write_h('Status: 503\n')
compressout.write_h(html_page)
# One factor is load avg for 1 minute, add some slop to the delay for bots.
compressout.write_h('Retry-After: 90\n')
compressout.write_h('\n')
compressout.write_b(open(html503file).read())
def index_page():
'''https://oskog97.com/read/'''
# Handle 304s.
ETag = '"{}{}{}"'.format(
'x'*('application/xhtml+xml' in html_page),
'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
os.stat('index.py').st_mtime,
)
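# The ETag encodes the negotiated variant ('x' when served as
# application/xhtml+xml, 'z' when gzip is accepted) plus index.py's
# mtime, so a cached copy is only reused for the same variant.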
compressout.write_h('Vary: If-None-Match\n')
compressout.write_h('ETag: {}\n'.format(ETag))
compressout.write_h(html_page)
if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
compressout.write_h('Status: 304\n\n')
return
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') == 'HEAD':
return
# Write out a static page.
compressout.write_b('''__HTML5__
<!-- With canonical link tag. -->
<link rel="stylesheet" type="text/css" href="/read/style.css"/>
<meta name="description" content="Interested in the scripts I have
on my website? Come and take a look at them."/>
__TITLE__
</head>
<body>
__NAVIGATION__
<main><div id="content">
__H1__
''')
compressout.write_b('''
<p>
Interested in the scripts I have on my website?
Go take a look at them; start crawling the
<a href="{0}?path=/">root directory</a> or take a look
at the <span class="a"><a href="{0}?sitemap=html"
>(sub)sitemap</a>.</span>
</p>
<div id="syntax">
<h2>Parameter syntax</h2>
<p>
Descriptions for the parameters can be found in
the request forms.
</p>
<ul>
<li>
Asterisks <q>*</q> represent a value that can be
(almost) anything.
</li>
<li>Square brackets <q>[]</q> represent optional.</li>
<li>Curly brackets <q>{{}}</q> represent mandatory.</li>
<li>Pipes <q>|</q> represent either or.</li>
</ul>
<p>There are three acceptable "sets" of parameters:</p>
<ol>
<li><pre>{0}?sitemap={{html|xml}}</pre></li>
<li><pre>{0}?path=*[&amp;download=yes]</pre></li>
<li><pre>{0}?path=*[&amp;referer=*[&amp;title=*]]</pre></li>
</ol>
<p>
The order of the valid parameters doesn't matter, but
this is the recommended/canonical order.
</p>
</div>
<div id="forms">
<h2>Request forms</h2>
<p><strong>
Notice that these are three different forms.
</strong></p>
<form action="{0}" method="get">
<h3>Sitemap</h3>
<p>
The <code>sitemap</code> parameter can be either
<q><code>html</code></q>, <q><code>xml</code></q>
or the default <q><code>none</code></q>.
It can't be used together with any other parameters.
</p>
<p>
<input type="radio" name="sitemap" value="html"/>
Request an HTML sitemap instead of a page<br/>
<input type="radio" name="sitemap" value="xml"/>
Request an XML sitemap instead of a page<br/>
<input type="submit"/>
</p>
</form>
<form action="{0}" method="get">
<h3>Page</h3>
<p>
A page (source code of a CGI script) is selected with the
<code>path</code> parameter. The value of the
<code>path</code> parameter is a URL relative to this
site, i.e. a URL beginning with a single slash.
</p>
<p>
The <code>path</code> is the site-local URL to the CGI
script or directory you're interested in. If you set the
value to <q><code>/read/index.py</code></q>, you'll get the
source code for this script. And if you set it to
<q><code>/</code></q>, you'll get a directory listing
of the site's root directory.
</p>
<p>
Path/URL: <input type="text" name="path" value="/"/>
<input type="submit"/><br/>
<input type="checkbox" name="download" value="yes"/>
Download / see it as plain text
</p>
<p>
The <code>download</code> parameter can be set to either
<q><code>yes</code></q> or the default
<q><code>no</code></q>. The download option obviously
does not work with directories.
</p>
</form>
<form action="{0}" method="get">
<h3>Link back to a referencing page</h3>
<p>
If <code>download</code> is <q><code>no</code></q> or
unset and a page (not a sitemap) was requested, it is
possible to change the navigation to make the requested
page link back to a referring page.
</p>
<p>
The <code>referer</code> (yes, misspelled like the HTTP
Referer) parameter is the URL of the referencing page.
(Don't try to specify a site that isn't mine.)
The <code>title</code> parameter gives the back link a
different text than <q>Back</q>.
</p>
<table>
<tr>
<th><code>path</code></th>
<td><input type="text" name="path" value="/"/></td>
</tr>
<tr>
<th><code>referer</code></th>
<td><input type="text" name="referer"/></td>
</tr>
<tr>
<th><code>title</code></th>
<td><input type="text" name="title"/></td>
</tr>
<tr>
<td></td>
<td><input type="submit"/></td>
</tr>
</table>
</form>
</div>
</div></main>
'''.format(my_url))
compressout.write_b('''
__FOOTER__
</body>
</html>
''')
def noindex(path):
'''
Returns True if `path` should be noindexed.
`path` is an absolute **filesystem** path.
'''
def isword(w):
letters = string.ascii_letters + ',.'
for ch in w:
if ch not in letters:
return False
return True
# 1. White list
# 2. Black list
# 3. Page quality (not applicable for directories)
# Check whitelist first.
for regex in conf['doindex']:
if re.match(regex, path[rootlen:]) is not None:
return False
# Blacklist (two kinds):
# - Generated from another file.
# - Explicitly blacklisted in 'read.cfg'.
for match, replace in conf['madefrom']:
if re.match(match, path[rootlen:]) is not None:
try:
os.stat(root + re.sub(match, replace, path[rootlen:]))
return True
except:
pass
for regex in conf['noindex'] + conf['hide']:
if re.match(regex, path[rootlen:]) is not None:
return True
# Quality:
# - Text file
# - At least 3072 Unicode code points
# - At least 300 words
# - At least 60 lines
# - Half the limits if a title and meta description are found
# - A third of the limits if an on-page description is also found
try:
os.listdir(path)
return False
except:
pass
# Normal file.
try:
if sys.version_info[0] > 2:
text = open(path).read()
else:
text = open(path).read().decode('utf-8')
except:
return True
min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
quality = mk_description(path)[0] + 1
min_chars //= quality; min_words //= quality
min_lines //= quality; min_comments //= quality
if len(text) < min_chars:
return True
if text.count('\n') + 1 < min_lines:
return True
n_comments = 0
is_comment = re.compile('^(.*#.*| *\\* .*|.*<!--.*|.*\'\'\'.*)$')
for line in text.split('\n'):
if re.match(is_comment, line) is not None:
n_comments += 1
if n_comments < min_comments:
return True
if len(list(filter(isword, text.replace('\n', ' ').split(' ')))) < min_words:
return True
# Passed the quality tests:
return False
def mk_navigation(referer, title):
'''
Returns a string which is the navigation bar's HTML.
`title` is the title of the requested page.
`referer` is a tuple of (URL, title) for the "back" link; it is used to
optionally ``integrate`` the page with a referring page.
'''
if referer[0]:
return htmlescape.escape('''<!-- Navigation generated by CGIread. -->
<nav><div id="navigation"><div id="nav_inner">
<p><a href="#content" class="textonly">Skip navigation</a></p>
<p class="row">
<span class="textonly" translate="no">[</span><a class="head" href="{URL}">{title}</a><span class="textonly" translate="no">]</span>
>>
<span class="textonly" translate="no">]</span><span class="sub active">{me}</span><span class="textonly" translate="no">[</span>
<span class="textonly" translate="no">[</span><a class="sub" href="{my_url}?sitemap=html">Sitemap for website's scripts</a><span class="textonly" translate="no">]</span>
</p>
<p class="row">
<span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
>>
<span class="textonly" translate="no">[</span><a class="sub" href="/read/">Website's scripts</a><span class="textonly" translate="no">]</span>
<span class="textonly" translate="no">[</span><a class="sub" href="/pages/policy.html">Privacy policy & terms of use</a><span class="textonly" translate="no">]</span>
<span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
</p>
<hr class="textonly"/>
</div></div></nav>
<!-- End of navigation. -->''',
URL=(2, referer[0]),
title=(1, referer[1]),
me=(1, title),
my_url=(0, my_url),
)
else:
return '''__NAVIGATION__'''
def mk_referer_param(referer):
'''Returns one of:
''
'&referer=' + referer[0]
'&referer=' + referer[0] + '&title=' + referer[1]
to be added to links from the requested page.
`referer` is used to **optionally** ``integrate`` a page.
See `mk_navigation`
'''
if referer[0]:
if referer[1] != 'Back':
title = '&title={}'.format(referer[1])
else:
title = ''
return '&referer={}{}'.format(referer[0], title)
else:
return ''
def mk_description(path):
'''
Return a tuple (good, title, meta_description, onpage_description).
`path` is the absolute filesystem path to the requested page.
`good` is
0 no title and description
1 title and meta description only
2 also an onpage description
`title` is the title of the page.
`meta_description` is the content of the description meta tag.
`onpage_description` is HTML content for the on-page description.
'''
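# The optional `path`.info sidecar file supplies this metadata: line 1 is
# the title, the lines up to a line containing only '.' are the meta
# description, and everything after that '.' line is the on-page HTML
# description.  Without a '.' line, all lines after the first form the
# meta description.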
good = 0
title = "source code of {}".format(path[rootlen:])
meta_description = ''
onpage_description = None
try:
content = open(path + '.info').read().split('\n')
good = 1
except:
pass
if good:
title = content[0]
try:
sep = content.index('.')
except ValueError:
sep = None
if sep is not None:
good = 2
meta_description = '\n'.join(content[1:sep])
onpage_description = '\n'.join(content[sep+1:])
else:
meta_description = '\n'.join(content[1:])
if onpage_description is None:
onpage_description = htmlescape.escape('<p>{}</p>',1,meta_description)
return good, title, meta_description, onpage_description
def sitemap(sitemap_type):
'''
Write out an XML or HTML sitemap.
sitemap_type in ('xml', 'html')
The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
'''
if os.getenv('REQUEST_METHOD') != 'HEAD': # NOTICE
# Prevent over-revving the server.
# HEAD requests are basically no-ops.
maxload = conf['sitemap-maxload']
if os.getloadavg()[0] > maxload['load-avg1']:
status503()
return
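# read.throttlecontrol stores the timestamps of the most recent sitemap
# requests, newest first.  If the oldest remembered request is still
# within 'throttle-time' seconds, the request is rejected with a 503,
# i.e. at most 'throttle-requests' sitemaps per 'throttle-time' seconds.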
try:
access_times = list(map(
float, open('read.throttlecontrol').read().strip().split(':')
))
except:
access_times = [0]
if time.time() - access_times[-1] < maxload['throttle-time']:
status503()
return
access_times.insert(0, time.time())
access_times = access_times[:maxload['throttle-requests']]
f = open('read.throttlecontrol', 'w')
f.write(':'.join(list(map(str, access_times))) + '\n')
f.close()
# Write headers before doing anything else.
# A HEAD request doesn't need to know the length (it's TE chunked).
if sitemap_type == 'xml':
compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
compressout.write_h(
'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
'; rel="canonical"' +
'; type="text/html"\n'
)
compressout.write_h('X-Robots-Tag: noindex\n\n') # NOTE: last.
elif sitemap_type == 'html':
compressout.write_h(html_page)
compressout.write_h('\n')
else:
assert False, "Neither 'xml' nor 'html'"
if os.getenv('REQUEST_METHOD') == 'HEAD': # NOTICE
return
# Find the pages worth being in the sitemap.
no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
paths = []
for basedir, dirs, files in os.walk(root, topdown=True):
# Exclude hidden directories:
remove_list = []
sys.stderr.write('In {}\n'.format(basedir))
sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
for dirname in dirs:
dirpath = os.path.join(basedir, dirname)[rootlen:]
for regex in no_access:
if re.match(regex, dirpath) is not None:
#dirs.remove(dirname)
# BUG: The for loop will skip items in the list if
# other items are removed while looping.
# This caused some really nasty stuff like sshin to
# be crawled, which took a whopping .65 seconds.
remove_list.append(dirname)
break
sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
for dirname in remove_list:
dirs.remove(dirname)
# Iterate over files:
for filename in files:
filepath = os.path.join(basedir, filename)
# No symlinks allowed.
#if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
if not os.path.islink(filepath):
#try:
description = mk_description(filepath)
if description[0]:
# Only indexable content allowed.
if not noindex(filepath):
paths.append((filepath[rootlen:], description[3]))
else:
sys.stderr.write('{} is noindexed\n'.format(filepath))
else:
sys.stderr.write('{} has no description\n'.format(filepath))
#except IOError as error:
#assert error.errno in (
#errno.EISDIR, errno.EACCES
#), error.errno
else:
sys.stderr.write('{} is link\n'.format(filepath))
paths.sort(key=lambda x: x[0])
# Print the body.
if sitemap_type == 'xml':
compressout.write_b('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
''')
#
for path, description in paths:
# Loop through all the regexes:
for regex in conf['noxmlsitemap']:
if re.match(regex, path) is not None:
break
else:
compressout.write_b(htmlescape.escape('''<url>
<loc>{canonical_url}?path={path}</loc>
<priority>0.5</priority>
''',
canonical_url=(0, canonical_url),
path=(1, path),
))
mod_sitemap.lastmod_changefreq(
root + path,
compressout,
)
compressout.write_b('</url>\n')
#
compressout.write_b('</urlset>\n')
elif sitemap_type == 'html':
compressout.write_b('''__HTML5NC__
<link rel="canonical" href="{canonical_url}?sitemap=html"/>
<link rel="alternate" href="{canonical_url}?sitemap=xml"
type="application/xml"/>
<meta name="robots" content="noindex, follow"/>
<title>Sitemap for scripts' source code</title>
<meta name="description" content="
Sitemap of all scripts available through /read/.
"/>
</head>
<body>
__NAVIGATION__
<main><div id="content" class="sitemap">
<h1 id="title">Sitemap for scripts' source code</h1>
<p><a href="{my_url}?path=/">Root directory</a></p>
<dl>
'''.format(my_url=my_url, canonical_url=canonical_url))
#
indent = 16 * ' '
for path, description in paths:
compressout.write_b(indent + htmlescape.escape(
'''<dt><a translate="no" href="{my_url}?path={path}">
{path}
</a></dt>\n''',
path=(0, path),
my_url=(0, canonical_url),
))
compressout.write_b(indent +
htmlescape.escape('<dd>{}</dd>\n', 0, description)
)
#
compressout.write_b(''' </dl>
</div></main>
__FOOTER__
</body>
</html>
''')
else:
assert False, "Neither 'xml' nor 'html'"
def ls(path, referer):
'''
Write an HTML directory listing for `path`.
`referer` is a (URL, title) tuple for the optional back link.
'''
compressout.write_h(html_page)
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') == 'HEAD':
return
compressout.write_b('''__HTML5NC__''')
compressout.write_b(htmlescape.escape('''
<link rel="stylesheet" type="text/css" href="/read/style.css"/>
<title>Index of {name}</title>
<meta name="robots" content="{robots_follow}, noindex"/>
<link rel="canonical" href="{canonical_url}?path={name}"/>
</head>
<body>
{navigation}
<main><div id="content" class="ls">
<h1 id="title">Index of <span translate="no">{name}</span></h1>
<p class="read-nav">
{isroot_commentout_start}
<a href="{my_url}?path={parent_path}{referer_params}">
Parent directory
</a>
{isroot_commentout_end}
<a href="{my_url}?sitemap=html">CGIread sitemap</a>
<a href="{my_url}">Main page</a>
</p>
<table id="ls">
''',
name =(1, path[rootlen:] + '/'),
parent_path =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
robots_follow =(2, 'no'*noindex(path)+'follow'),
navigation =(0, mk_navigation(
referer,
"Index of "+path[rootlen:]+'/'
)),
referer_params=(2, mk_referer_param(referer)),
my_url=(0, my_url),
canonical_url=(0, canonical_url),
isroot_commentout_start=(0, '<!--'*(path == root)),
isroot_commentout_end=(0, '-->'*(path == root)),
))
no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
for x in sorted(os.listdir(path)):
full_path = os.path.join(path, x)
forbidden = False
for regex in no_access:
if re.match(regex, full_path[rootlen:]) is not None:
forbidden = True
break
if forbidden:
continue
#url = cgi.escape(full_path, quote=True)
try:
os.listdir(full_path)
is_dir = 1
except:
is_dir = 0
# Pick mobile_desc and desktop_desc for this entry.
if is_dir:
mobile_desc = '<span class="yeah">-></span>'
desktop_desc = '<span class="yeah">Directory</span>'
else:
try:
content = open(full_path).read() # This fails on Python 3 !!!
if sys.version_info[0] == 2:
content.decode('UTF-8')
binary = False
except:
binary = True
if binary:
desktop_desc = 'Binary'
mobile_desc = ':-('
else:
good, title, meta_d, onpage_d = mk_description(full_path)
if good == 2:
desktop_desc = htmlescape.escape(
'<span class="thenumberofthebeast">{}</span>',
1, meta_d
)
if noindex(full_path):
mobile_desc = '<span class="yeah">:-)</span>'
else:
mobile_desc = '<span class="thenumberofthebeast">:-D</span>'
elif not noindex(full_path):
mobile_desc = '<span class="yeah">:-)</span>'
if compressout.debug_cookie:
desktop_desc = '<span class="yeah">Text; indexable</span>'
else:
desktop_desc = '<span class="yeah">Text</span>'
else:
mobile_desc = ':-|'
if compressout.debug_cookie:
desktop_desc = 'Boring; unindexable'
else:
desktop_desc = 'Looks boring'
compressout.write_b(
htmlescape.escape(
'''<tr><td class="mobile">{mobile_desc}</td>
<td><a translate="no"
href="{site}?path={path}{referer}">{text}</a></td>
<td class="desktop">{desktop_desc}</td></tr>
''',
site=(0, my_url),
path=(2, full_path[rootlen:] + '/'*is_dir),
referer=(2, mk_referer_param(referer)),
text=(1, x + '/'*is_dir),
mobile_desc=(0, mobile_desc),
desktop_desc=(0, desktop_desc),
)
)
compressout.write_b(''' <!--</p>--></table>
</div></main>
__FOOTER__
</body>
</html>\n''')
def download(path):
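'''
Serve `path` raw (the `download=yes` view): text/plain for UTF-8 text,
with an X-Robots-Tag header, a canonical Link header pointing at the
HTML view, and If-None-Match handling via `if_none_match`.
'''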
if noindex(path):
compressout.write_h('X-Robots-Tag: noindex\n')
else:
compressout.write_h('X-Robots-Tag: index\n') # For verbosity.
try:
content = open(path).read()
if sys.version_info[0] == 2:
content.decode('utf-8')
compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
compressout.write_h(htmlescape.escape(
'Link: <{}?path={}>',
0, canonical_url,
2, path[rootlen:]
) + '; rel="canonical"; type="text/html"\n'
)
except:
compressout.write_h(htmlescape.escape(
'Link: <{}?path={}>; rel="canonical"\n',
0, canonical_url,
2, path[rootlen:]
)) # No type specified.
if if_none_match(path):
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') != 'HEAD':
compressout.write_b(content)
def cat(path, referer):
'''
Render the source of `path` as a numbered HTML listing with
per-line fragment identifiers.
`referer` is a (URL, title) tuple for the optional back link.
'''
def ol_content(text):
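'''
Return (list_items, fragment_links): every line of `text` becomes an
<li><pre> element, and lines that declare a def/class or carry an
id="..." attribute get a permanent fragment identifier.
'''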
out_lines = []
ids = []
allowed_chars = string.ascii_letters + '_-'
for index, line in enumerate(text.split('\n')):
# Create a "permanent" fragment identifier for this line.
this_id = ''
# Find ids in Python and XHTML
for decltype in ('def', 'class'):
if line.strip().startswith(decltype + ' ') and '(' in line:
this_id = line.split(decltype, 1)[1].split('(')[0].strip()
if 'id="' in line:
this_id = line.split('id="')[1].split('"')[0]
# Prevent bad ids.
for ch in this_id:
if ch not in allowed_chars:
this_id = ''
break
if this_id in ids:
this_id = ''
# Create the fragment identifier for the line.
if this_id:
ids.append(this_id)
idline = 'id="content_{}"'.format(this_id)
else:
idline = ''
# Create line
out_lines.append(htmlescape.escape(
' <li id="{}"><pre translate="no" {}>{}</pre></li>\n',
0, index + 1,
0, idline,
1, line,
))
fragment_links = []
for fragment in sorted(ids):
fragment_links.append(
(
'<a class="quick" href="#content_{0}" translate="no"' +
'>{0}</a>\n'
).format(
fragment
)
)
return ''.join(out_lines), ''.join(fragment_links)
try:
content = open(path).read()
if sys.version_info[0] == 2:
content.decode('utf-8')
except:
if noindex(path):
compressout.write_h('X-Robots-Tag: noindex\n')
else:
compressout.write_h('X-Robots-Tag: index\n')
compressout.write_h('\n')
compressout.write_b(content)
return
compressout.write_h(html_page)
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') == 'HEAD':
return
ignore, title, meta_description, p_description = mk_description(path)
last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
lines, fragment_links = ol_content(content)
if not fragment_links:
fragment_links = '(none)'
compressout.write_b('''__HTML5NC__''')
compressout.write_b('''
<script type="application/ld+json">
{
"@context":
{
"@vocab": "http://schema.org/"
},
"@type": "SoftwareSourceCode",
"license": "https://opensource.org/licenses/BSD-2-Clause",
"author":
{
''')
compressout.write_b('''
"@type": "Person",
"@id": "__SITE__/",
"name": "{0}",
"url": "__SITE__/"
'''.format(owner))
compressout.write_b('''
},
"publisher": {"@id": "__SITE__/"},
"copyrightHolder": {"@id": "__SITE__/"},
''')
compressout.write_b('''
"url": "{}#code",
"DateModified": "{}"
'''.format(
canonical_url + '?path=' + path[rootlen:],
last_modified,
))
compressout.write_b('''
}
</script>
''')
parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
compressout.write_b(htmlescape.escape('''
<link rel="stylesheet" type="text/css" href="/read/style.css"/>
<title>{title}</title>
<link rel="canonical" href="{canonical}"/>
<link
rel="alternate"
href="{canonical}&download=yes"
type="text/plain"
/>
<meta name="robots" content="{noindex_no}index"/>
<meta name="description" content="{meta_description}"/>
</head>
<body>
{navigation}
<main><div id="content">
<h1 id="title" translate="no">{title}</h1>
<div id="description">
{content_description}
</div>
<table>
<tr>
<td>Last modified</td>
<td><time datetime="{last_modified}">{last_modified}</time></td>
</tr>
<tr>
<td>Lines</td>
<td>{linecount}</td>
</tr>
{begin_debug}<tr>
<td>Indexable</td>
<td>{indexable}</td>
</tr>{end_debug}
</table>
<p class="notprint read-nav">
<a href="{my_url}?path={parent_dir}">Parent directory</a>
<a href="{my_url}?path={path}&download=yes" target="_blank">Download</a>
<a href="{my_url}?sitemap=html">CGIread sitemap</a>
<a href="{my_url}">Main page</a>
</p>
<p class="notprint">
Quick links:\n{fragments}
</p>
<ol id="code">
{content}
</ol>
</div></main>
''',
title=(2, title),
content=(0, lines),
parent_dir=(2, parent_link + mk_referer_param(referer)),
navigation=(0, mk_navigation(referer, path[rootlen:])),
canonical=(2, canonical_url + '?path=' + path[rootlen:]),
path=(2, path[rootlen:]),
noindex_no=(2, 'no' * noindex(path)),
meta_description=(2, meta_description),
content_description=(0, p_description),
last_modified=(2, last_modified),
linecount=(1, content.count('\n') + 1),
indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
fragments=(0, fragment_links),
my_url=(0, my_url),
begin_debug=(0,['<!--',''][compressout.debug_cookie]),
end_debug=(0,['-->',''][compressout.debug_cookie]),
))
compressout.write_b('''
__FOOTER__
</body>
</html>
''')
def if_none_match(path):
'''
ETag handling for `cat`, `ls` and `download`:
Returns `True` if content needs to be generated.
Outputs necessary headers and 304 statuses.
'''
try:
meta_time = os.stat(path + '.info').st_mtime
except:
meta_time = 0
if sys.version_info[0] > 2:
query_string = os.getenv('QUERY_STRING', '').encode('utf-8')
else:
query_string = os.getenv('QUERY_STRING', '')
ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
'x'*('application/xhtml+xml' in html_page),
'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
os.stat(path).st_mtime,
meta_time,
base64.b64encode(query_string),
os.stat('index.py').st_mtime,
os.stat('read.cfg').st_mtime,
)
compressout.write_h('Vary: If-None-Match\n')
compressout.write_h('ETag: {}\n'.format(ETag))
compressout.write_h(
'''X-ETag-Synopsis: [x][z]-<f_time>(<m_time>)-<query>-(<s_time>-<c_time>)
X-ETag-Description-x: "Client accepts application/xhtml+xml"
X-ETag-Description-z: "Content-Encoding: gzip"
X-ETag-Description-f_time: "Unix last modified time for the requested file"
X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
X-ETag-Description-query: "base64 encoded $QUERY_STRING"
X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
''')
if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
compressout.write_h('Status: 304\n\n')
return False
else:
return True
def is_injection_attempt(path_param, referer_URI, referer_title):
'''
Various checks to see if any form of injection attempt has been
made. This function checks the `path`, `referer` and `title`
parameters.
Returns True if the request is an injection attempt.
- XSS
- URL injection
- Spam injection
- Restricted files access
'''
# If the path parameter contains an XSS attempt, it can't be corrected
evil = False
# Prevent attacks.
if '..' in path_param:
return True
for var in referer_URI, referer_title:
for ch in var:
if ord(ch) < 32:
return True
if ch in '<>&\'"':
return True
# NOTICE: The following will limit parameters to ASCII.
if ord(ch) > 126:
return True
# Prevent linking to Mallory.
for start in ('http://', 'https://', '//', 'ftp://'):
if referer_URI.startswith(start):
hostname = referer_URI.split('//')[1].split('/')[0]
if hostname not in conf['allowed-referer-hosts']:
return True
else:
break
else:
if ':' in referer_URI:
return True
# Prevent injected spam
if spammy.spammy(referer_title) or len(referer_title) > 42:
return True
# No match.
return False
def handle_injection_attempt(path_param, referer_URI, referer_title):
'''
Decide if the injection attempt was due to innocently following
a malicious link or due to creating one.
'''
# Check if the URL can be sanitized.
if is_injection_attempt(path_param, '', ''):
destination = 'https://en.wikipedia.org/wiki/Data_validation'
else:
destination = my_url + '?path=' + path_param
redirect_spam(destination)
def main():
'''
`compressout.init` MUST be called before `main`
and `compressout.done` after.
'''
# HTML vs XHTML
global html_page
html_page = 'Vary: Accept\n'
if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
else:
html_page += 'Content-Type: text/html; charset=UTF-8\n'
# Check that the method is either GET, HEAD or OPTIONS.
if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
if os.getenv('REQUEST_METHOD') != 'OPTIONS':
compressout.write_h('Status: 405\n')
compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
compressout.write_h('Content-Type: text/plain\n')
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') != 'OPTIONS':
compressout.write_b('Method not allowed!\n')
compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
return
# Get the parameters.
params = cgi.FieldStorage()
path = path_param = params.getfirst('path', default='')
referer_URI = params.getfirst('referer', default='')
referer_title = params.getfirst('title', default='Back')
referer = (referer_URI, referer_title)
download_flag = params.getfirst('download', default='no')
sitemap_param = params.getfirst('sitemap', default='none')
if not os.getenv('QUERY_STRING'):
index_page()
return
# Bad request, but it would also match the evil patterns below,
# so keep this check before the evil stopper.
if bool(path_param) and not path_param.startswith('/'):
status400('`path` is not relative to this site. (No leading slash.)')
return
# Do not allow evil requests.
allow = True
# Keep things within the server root.
try:
path = os.path.realpath(root + path)
except:
allow = False
if path != root and not path.startswith(root + '/'):
allow = False
# Stop at forbidden paths. #1/2
for regex in conf['noaccess']:
if re.match(regex, path[rootlen:]) is not None:
allow = False
# Prevent XSS, URL injection, spam injection and miscellaneous assholery.
if is_injection_attempt(path_param, referer_URI, referer_title):
allow = False
if not allow:
handle_injection_attempt(path_param, referer_URI, referer_title)
return
# Bad requests:
if download_flag not in ('yes', 'no'):
status400('`download` MUST be "yes", "no" or unset.')
return
if bool(path_param) and sitemap_param != 'none':
status400('The `sitemap` parameter cannot be used with any other.')
return
if download_flag == 'yes' and bool(referer_URI):
status400("`download=yes` can't be used with the `referer` parameter.")
return
if sitemap_param not in ('none', 'xml', 'html'):
status400('`sitemap` MUST be "html", "xml" or unset.')
return
if download_flag == 'yes' and not bool(path_param):
status400('Nothing to `download`. Use the `path` parameter.')
return
if bool(referer_URI) and not bool(path_param):
status400('`referer` cannot be used without `path`')
return
if referer_title != 'Back' and not bool(referer_URI):
status400('`referer` is not set.')
return
if allow:
# Generate sitemap?
if sitemap_param != 'none':
sitemap(sitemap_param)
else:
# Stop at forbidden paths. #2/2
for regex in conf['topsecret']:
if re.match(regex, path[rootlen:]) is not None:
status404()
break
else:
# Allowed to be seen.
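# os.listdir doubles as the directory probe: it succeeds for
# directories, raises ENOTDIR for regular files and ENOENT for
# missing paths.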
try:
os.listdir(path)
if download_flag == 'no':
if if_none_match(path):
ls(path, referer)
else:
status400("Can't download a directory.")
except OSError as e:
if e.errno == errno.ENOTDIR:
if download_flag == 'no':
if if_none_match(path):
cat(path, referer)
else:
# `download` sets a few headers.
download(path)
elif e.errno == errno.ENOENT:
status404()
else:
raise ValueError(
'errno must be either ENOTDIR or ENOENT'
)
if __name__ == '__main__':
compressout.init()
main()
compressout.done()