#!/usr/bin/python3
# -*- coding: utf-8 -*-

root = '/var/www'
owner = 'Oskar Skog'
my_url = '/read/'
canonical_url = 'https://oskog97.com/read/'

html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'

import sys
sys.path.append(root)

import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap   # Name conflict with already existing function.
import cgitb
cgitb.enable()

rootlen = len(root)

#html_mime = 'text/html'    # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.

# Site configuration (an illustrative sketch of the expected keys follows
# status400() below).
conf = eval(open('read.cfg').read())


def redirect_spam(destination):
    '''`destination` is the URL to which assholes should be redirected.'''
    compressout.write_h('Status: 303\n')
    compressout.write_h('Location: {}\n'.format(destination))
    compressout.write_h('\n')


def status400(message):
    '''HTTP 400; `message` goes UNESCAPED inside an HTML element.'''
compressout.write_h('Status: 400\n')
compressout.write_h(html_page)
compressout.write_h('\n')
compressout.write_b('''
400 - Bad Request
400 - Bad Request
{}
Your request can't be understood.
Check the parameters.
'''.format(message))
compressout.write_b('''
''')
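# Illustrative sketch of `read.cfg` (NOT the real configuration; every regex
# and number below is a made-up example).  The file is eval()'d into a dict
# near the top of this script, and the code expects at least these keys:
#
#   {
#       'doindex': [r'.*\.py$'],                    # whitelist: always indexable
#       'noindex': [r'.*\.min\.js$'],               # blacklist: never indexable
#       'hide': [r'^/tmp/'],                        # hidden and not indexable
#       'noaccess': [r'^/secret/'],                 # excluded from listings and sitemaps
#       'topsecret': [r'^/very-secret/'],           # likewise excluded
#       'madefrom': [(r'(.*)\.css$', r'\1.scss')],  # (match, replace) pairs for
#                                                   # generated files; a file whose
#                                                   # source exists is never indexed
#       'noxmlsitemap': [r'^/drafts/'],             # kept out of the XML sitemap
#       'sitemap-maxload': {
#           'load-avg1': 2.5,        # refuse sitemaps above this 1-minute load avg
#           'throttle-time': 60,     # min age (s) of oldest remembered sitemap request
#           'throttle-requests': 5,  # timestamps kept in read.throttlecontrol
#       },
#   }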
def status403():
'''HTTP 403'''
compressout.write_h(html_page)
compressout.write_h('Status: 403\n\n')
compressout.write_b(open(html403file).read())
def status404():
'''HTTP 404'''
compressout.write_h('Status: 404\n')
compressout.write_h(html_page)
compressout.write_h('\n')
compressout.write_b(open(html404file).read())
def status503():
'''
HTTP 503
    Call this when the server is under too much load to complete the request.
(Used by the sitemap function.)
'''
compressout.write_h('Status: 503\n')
compressout.write_h(html_page)
# One factor is load avg for 1 minute, add some slop to the delay for bots.
compressout.write_h('Retry-After: 90\n')
compressout.write_h('\n')
compressout.write_b(open(html503file).read())
def index_page():
'''https://oskog97.com/read/'''
# Handle 304s.
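    # The ETag encodes everything the response body depends on: an 'x' if the
    # page is being served as XHTML (html_page is switched to XHTML elsewhere),
    # a 'z' if the client accepts gzip, and the mtime of index.py itself,
    # since the page below is static output of this script.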
ETag = '"{}{}{}"'.format(
'x'*('application/xhtml+xml' in html_page),
'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
os.stat('index.py').st_mtime,
)
compressout.write_h('Vary: If-None-Match\n')
compressout.write_h('ETag: {}\n'.format(ETag))
compressout.write_h(html_page)
if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
compressout.write_h('Status: 304\n\n')
return
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') == 'HEAD':
return
# Write out a static page.
compressout.write_b('''
Website's scripts
Website's scripts
''')
compressout.write_b('''
Interested in the scripts I have on my website?
Go take a look at them; start crawling the
root directory or take a look
at the (sub)sitemap.
Parameter syntax
Descriptions for the parameters can be found in
the request forms.
-
Asterisks
*
represent a value that can be
(almost) anything.
- Square brackets
[]
represent optional.
- Curly brackets
{}
represent mandatory.
- Pipes
|
represent either or.
There are three acceptable "sets" of parameters:
{0}?sitemap={html|xml}
{0}?path=*[&download=yes]
{0}?path=*[&referer=*[&title=*]]
The order of the valid parameters doesn't matter, but
this is the recommended/canonical order.
Request forms
Notice that these are three different forms.
'''.format(my_url))
compressout.write_b('''
''')
def noindex(path):
'''
Returns True if `path` should be noindexed.
`path` is an absolute **filesystem** path.
'''
def isword(w):
letters = string.ascii_letters + ',.'
for ch in w:
            if ch not in letters:
return False
return True
# 1. White list
# 2. Black list
# 3. Page quality (not applicable for directories)
# Check whitelist first.
for regex in conf['doindex']:
        if re.match(regex, path[rootlen:]) is not None:
            return False
# Blacklist (two kinds):
# - Generated from another file.
# - Explicitly blacklisted in 'read.cfg'.
for match, replace in conf['madefrom']:
if re.match(match, path[rootlen:]) is not None:
try:
os.stat(root + re.sub(match, replace, path[rootlen:]))
return True
except:
pass
for regex in conf['noindex'] + conf['hide']:
if re.match(regex, path[rootlen:]) is not None:
return True
# Quality:
# - Text file
# - At least 3072 Unicode code points
# - At least 300 words
# - At least 60 lines
    # - Half the limitations if a meta description and title are found
    # - A third of the limitations if an onpage description is found
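    # Example: a page whose .info file provides only a title and meta
    # description has mk_description(path)[0] == 1, so quality == 2 below and
    # the thresholds drop to 1536 code points, 150 words, 30 lines and 12
    # comments; with an onpage description as well they drop to a third.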
try:
os.listdir(path)
return False
except:
pass
# Normal file.
try:
if sys.version_info[0] > 2:
text = open(path).read()
else:
text = open(path).read().decode('utf-8')
except:
return True
min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
quality = mk_description(path)[0] + 1
min_chars //= quality; min_words //= quality
min_lines //= quality; min_comments //= quality
if len(text) < min_chars:
return True
if text.count('\n') + 1 < min_lines:
return True
n_comments = 0
is_comment = re.compile('^(.*#.*| *\\* .*|.*
''',
URL=(2, referer[0]),
title=(1, referer[1]),
me=(1, title),
my_url=(0, my_url),
)
else:
return '''
'''
def mk_referer_param(referer):
'''Returns one of:
''
'&referer=' + referer[0]
'&referer=' + referer[0] + '&title=' + referer[1]
to be added to links from the requested page.
`referer` is used to **optionally** ``integrate`` a page.
See `mk_navigation`
'''
if referer[0]:
if referer[1] != 'Back':
title = '&title={}'.format(referer[1])
else:
title = ''
return '&referer={}{}'.format(referer[0], title)
else:
return ''
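# Illustrative usage of mk_referer_param (the inputs are made-up examples):
#
#     mk_referer_param(('', ''))                      -> ''
#     mk_referer_param(('/read/?path=/foo', 'Back'))  -> '&referer=/read/?path=/foo'
#     mk_referer_param(('/read/?path=/foo', 'foo'))   -> '&referer=/read/?path=/foo&title=foo'
#
# The values are interpolated as-is; callers escape the result when embedding
# it in HTML (see the referer_params argument in ls()).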
def mk_description(path):
'''
    Return a tuple (good, title, meta_description, onpage_description).
`path` is the absolute filesystem path to the requested page.
`good` is
0 no title and description
1 title and meta description only
2 also an onpage description
`title` is the title of the page.
`meta_description` is the content of the description meta tag.
    `onpage_description` is HTML content for the onpage description of the
    requested page.
'''
good = 0
title = "source code of {}".format(path[rootlen:])
meta_description = ''
onpage_description = None
try:
content = open(path + '.info').read().split('\n')
good = 1
except:
pass
if good:
title = content[0]
try:
sep = content.index('.')
except ValueError:
sep = None
if sep is not None:
good = 2
meta_description = '\n'.join(content[1:sep])
onpage_description = '\n'.join(content[sep+1:])
else:
meta_description = '\n'.join(content[1:])
if onpage_description is None:
        onpage_description = htmlescape.escape('{}', 1, meta_description)
return good, title, meta_description, onpage_description
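# A page may have a `<filename>.info` companion file next to it; the layout
# expected by mk_description() is (illustrative example, not a real file):
#
#     Title of the page
#     One or more lines of meta description.
#     .
#     One or more lines of raw HTML used as the onpage description.
#
# The line containing only '.' separates the meta description from the onpage
# description; without it everything after the title is treated as the meta
# description and `good` stays at 1.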
def sitemap(sitemap_type):
'''
Write out an XML or HTML sitemap.
sitemap_type in ('xml', 'html')
The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
'''
if os.getenv('REQUEST_METHOD') != 'HEAD': # NOTICE
# Prevent over-revving the server.
# HEAD requests are basically no-ops.
maxload = conf['sitemap-maxload']
if os.getloadavg()[0] > maxload['load-avg1']:
status503()
return
try:
access_times = list(map(
float, open('read.throttlecontrol').read().strip().split(':')
))
except:
access_times = [0]
if time.time() - access_times[-1] < maxload['throttle-time']:
status503()
return
access_times.insert(0, time.time())
access_times = access_times[:maxload['throttle-requests']]
f = open('read.throttlecontrol', 'w')
f.write(':'.join(list(map(str, access_times))) + '\n')
f.close()
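        # read.throttlecontrol now holds the access times of the most recent
        # sitemap requests as colon-separated Unix timestamps, newest first,
        # e.g. "1500000120.0:1500000060.0" (made-up values).  A request is
        # refused with 503 as long as the oldest remembered timestamp is
        # younger than maxload['throttle-time'] seconds.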
# Write headers before doing anything else.
# A HEAD request doesn't need to know the length (it's TE chunked).
if sitemap_type == 'xml':
compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
compressout.write_h(
'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
'; rel="canonical"' +
'; type="text/html"\n'
)
compressout.write_h('X-Robots-Tag: noindex\n\n') # NOTE: last.
elif sitemap_type == 'html':
compressout.write_h(html_page)
compressout.write_h('\n')
else:
assert False, "Neither 'xml' nor 'html'"
if os.getenv('REQUEST_METHOD') == 'HEAD': # NOTICE
return
# Find the pages worth being in the sitemap.
no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
paths = []
for basedir, dirs, files in os.walk(root, topdown=True):
# Exclude hidden directories:
remove_list = []
sys.stderr.write('In {}\n'.format(basedir))
sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
for dirname in dirs:
dirpath = os.path.join(basedir, dirname)[rootlen:]
for regex in no_access:
if re.match(regex, dirpath) is not None:
#dirs.remove(dirname)
# BUG: The for loop will skip items in the list if
# other items are removed while looping.
                    # This caused some real nasty stuff like sshin to
# be crawled, took a whopping .65 seconds.
remove_list.append(dirname)
break
sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
for dirname in remove_list:
dirs.remove(dirname)
# Iterate over files:
for filename in files:
filepath = os.path.join(basedir, filename)
# No symlinks allowed.
#if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
if not os.path.islink(filepath):
#try:
description = mk_description(filepath)
if description[0]:
# Only indexable content allowed.
if not noindex(filepath):
paths.append((filepath[rootlen:], description[3]))
else:
sys.stderr.write('{} is noindexed\n'.format(filepath))
else:
sys.stderr.write('{} has no description\n'.format(filepath))
#except IOError as error:
#assert error.errno in (
#errno.EISDIR, errno.EACCES
#), error.errno
else:
sys.stderr.write('{} is link\n'.format(filepath))
paths.sort(key=lambda x: x[0])
# Print the body.
if sitemap_type == 'xml':
compressout.write_b('''
''')
#
for path, description in paths:
# Loop through all the regexes:
for regex in conf['noxmlsitemap']:
if re.match(regex, path) is not None:
break
else:
compressout.write_b(htmlescape.escape('''
{canonical_url}?path={path}
0.5
''',
canonical_url=(0, canonical_url),
path=(1, path),
))
mod_sitemap.lastmod_changefreq(
root + path,
compressout,
)
compressout.write_b(' \n')
#
compressout.write_b(' \n')
elif sitemap_type == 'html':
compressout.write_b('''
Sitemap for scripts' source code
Sitemap for scripts' source code
'''.format(my_url=my_url, canonical_url=canonical_url))
#
indent = 16 * ' '
for path, description in paths:
compressout.write_b(indent + htmlescape.escape(
'''-
{path}
\n''',
path=(0, path),
my_url=(0, canonical_url),
))
compressout.write_b(indent +
htmlescape.escape('- {}
\n', 0, description)
)
#
compressout.write_b('''
''')
else:
assert False, "Neither 'xml' nor 'html'"
def ls(path, referer):
    '''
    Write out an HTML index (directory listing) page for the directory `path`.
    `referer` optionally integrates the listing with the referring page.
    '''
compressout.write_h(html_page)
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') == 'HEAD':
return
compressout.write_b('''
''')
compressout.write_b(htmlescape.escape('''
Index of {name}
{navigation}
Index of {name}
{isroot_commentout_start}
Parent directory
{isroot_commentout_end}
CGIread sitemap
Main page
''',
name =(1, path[rootlen:] + '/'),
parent_path =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
robots_follow =(2, 'no'*noindex(path)+'follow'),
navigation =(0, mk_navigation(
referer,
"Index of "+path[rootlen:]+'/'
)),
referer_params=(2, mk_referer_param(referer)),
my_url=(0, my_url),
canonical_url=(0, canonical_url),
isroot_commentout_start=(0, ''*(path == root)),
))
no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
for x in sorted(os.listdir(path)):
full_path = os.path.join(path, x)
forbidden = False
for regex in no_access:
if re.match(regex, full_path[rootlen:]) is not None:
forbidden = True
break
if forbidden:
continue
#url = cgi.escape(full_path, quote=True)
try:
os.listdir(full_path)
is_dir = 1
except:
is_dir = 0
# mobile_desc
# desktop_desc
if is_dir:
mobile_desc = '->'
desktop_desc = 'Directory'
else:
try:
content = open(full_path).read() # This fails on Python 3 !!!
if sys.version_info[0] == 2:
content.decode('UTF-8')
binary = False
except:
binary = True
if binary:
desktop_desc = 'Binary'
mobile_desc = ':-('
else:
good, title, meta_d, onpage_d = mk_description(full_path)
if good == 2:
desktop_desc = htmlescape.escape(
'{}',
1, meta_d
)
if noindex(full_path):
mobile_desc = ':-)'
else:
mobile_desc = ':-D'
elif not noindex(full_path):
mobile_desc = ':-)'
if compressout.debug_cookie:
desktop_desc = 'Text; indexable'
else:
desktop_desc = 'Text'
else:
mobile_desc = ':-|'
if compressout.debug_cookie:
desktop_desc = 'Boring; unindexable'
else:
desktop_desc = 'Looks boring'
compressout.write_b(
htmlescape.escape(
'''{mobile_desc}
{text}
{desktop_desc}
''',
site=(0, my_url),
path=(2, full_path[rootlen:] + '/'*is_dir),
referer=(2, mk_referer_param(referer)),
text=(1, x + '/'*is_dir),
mobile_desc=(0, mobile_desc),
desktop_desc=(0, desktop_desc),
)
)
compressout.write_b('''
\n''')
def download(path):
if noindex(path):
compressout.write_h('X-Robots-Tag: noindex\n')
else:
compressout.write_h('X-Robots-Tag: index\n') # For verbosity.
try:
content = open(path).read()
if sys.version_info[0] == 2:
content.decode('utf-8')
compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
compressout.write_h(htmlescape.escape(
'Link: <{}?path={}>',
0, canonical_url,
2, path[rootlen:]
) + '; rel="canonical"; type="text/html"\n'
)
except:
compressout.write_h(htmlescape.escape(
'Link: <{}?path={}>; rel="canonical"\n',
0, canonical_url,
2, path[rootlen:]
)) # No type specified.
if if_none_match(path):
compressout.write_h('\n')
if os.getenv('REQUEST_METHOD') != 'HEAD':
compressout.write_b(content)
def cat(path, referer):
    '''
    Write out an HTML page showing the source code of the file `path`.
    `referer` optionally integrates the page with the referring page.
    '''
def ol_content(text):
out_lines = []
ids = []
allowed_chars = string.ascii_letters + '_-'
for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment for this line.
this_id = ''
# Find ids in Python and XHTML
for decltype in ('def', 'class'):
if line.strip().startswith(decltype + ' ') and '(' in line:
this_id = line.split(decltype, 1)[1].split('(')[0].strip()
if 'id="' in line:
this_id = line.split('id="')[1].split('"')[0]
# Prevent bad ids.
for ch in this_id:
if ch not in allowed_chars:
this_id = ''
break
if this_id in ids:
this_id = ''
# Create the fragment identifier for the line.
if this_id:
ids.append(this_id)
idline = 'id="content_{}"'.format(this_id)
else:
idline = ''
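            # Illustrative examples of the ids generated above (assumed input
            # lines, not from any real file):
            #     'def ls(path, referer):'  ->  id="content_ls"
            #     '<div id="foo">'          ->  id="content_foo"
            # so a line keeps its fragment even if its line number changes.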
# Create line
out_lines.append(htmlescape.escape(
' {}| Last modified | |
| Lines | {linecount} |
| Indexable | {indexable} |
Parent directory Download CGIread sitemap Main page
Quick links:\n{fragments}