#!/usr/bin/python3
# -*- coding: utf-8 -*-

root = '/var/www'
owner = 'Oskar Skog'
my_url = '/read/'
canonical_url = 'https://oskog97.com/read/'
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'

import sys
sys.path.append(root)
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap   # Name conflict with already existing function.
import cgitb
cgitb.enable()

rootlen = len(root)
#html_mime = 'text/html'                                # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
conf = eval(open('read.cfg').read())
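
# read.cfg lives next to this script and is eval()'d into a dict; judging from
# the keys used below, it holds regex lists plus sitemap throttling limits.
# A hypothetical example (all values made up for illustration):
#
#   {
#       'doindex': [r'.*\.py$'],                  # whitelist: always indexable
#       'noindex': [r'.*\.min\.js$'],             # never indexable
#       'hide': [r'^/tmp($|/)'],                  # hidden from listings and sitemaps
#       'noaccess': [r'^/secret($|/)'],           # never listed
#       'topsecret': [r'^/private($|/)'],         # never listed
#       'madefrom': [(r'(.*)\.html$', r'\1.md')], # (regex, replacement) pairs for
#                                                 # files generated from another file
#       'noxmlsitemap': [r'^/drafts($|/)'],       # excluded from the XML sitemap only
#       'sitemap-maxload': {
#           'load-avg1': 2.5,        # refuse sitemap requests above this load average
#           'throttle-time': 10,     # minimum seconds between sitemap requests
#           'throttle-requests': 4,  # how many recent request times to remember
#       },
#   }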


def redirect_spam(destination):
    '''`destination` is the URL to which assholes should be redirected.'''
    compressout.write_h('Status: 303\n')
    compressout.write_h('Location: {}\n'.format(destination))
    compressout.write_h('\n')


def status400(message):
    '''HTTP 400; `message` goes UNESCAPED inside a <p> element.'''
    compressout.write_h('Status: 400\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    compressout.write_b('''<html>
    <head>
        <title>400 - Bad Request</title>
    </head>
    <body>
        <h1>400 - Bad Request</h1>
        <p>{}</p>
'''.format(message))
    compressout.write_b('''        <p>Your request can't be understood.  Check the parameters.</p>
    </body>
</html>
''')


def status403():
    '''HTTP 403'''
    compressout.write_h(html_page)
    compressout.write_h('Status: 403\n\n')
    compressout.write_b(open(html403file).read())


def status404():
    '''HTTP 404'''
    compressout.write_h('Status: 404\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    compressout.write_b(open(html404file).read())


def status503():
    '''
    HTTP 503

    Call this if there is too much load on the server to do something.
    (Used by the sitemap function.)
    '''
    compressout.write_h('Status: 503\n')
    compressout.write_h(html_page)
    # One factor is load avg for 1 minute, add some slop to the delay for bots.
    compressout.write_h('Retry-After: 90\n')
    compressout.write_h('\n')
    compressout.write_b(open(html503file).read())


def index_page():
    '''https://oskog97.com/read/'''
    # Handle 304s.
    ETag = '"{}{}{}"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat('index.py').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(html_page)
    if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
        compressout.write_h('Status: 304\n\n')
        return
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    # Write out a static page.
    compressout.write_b('''<html>
    <head>
        <title>Website's scripts</title>
    </head>
    <body>
        <h1>Website's scripts</h1>
        <p>
            Interested in the scripts I have on my website?
            Go take a look at them; start crawling the root directory
            or take a look at the (sub)sitemap.
        </p>
        <h2>Parameter syntax</h2>
        <p>Descriptions for the parameters can be found in the request forms.</p>
        <ul>
            <li>Asterisks <code>*</code> represent a value that can be (almost) anything.</li>
            <li>Square brackets <code>[]</code> represent optional.</li>
            <li>Curly brackets <code>{{}}</code> represent mandatory.</li>
            <li>Pipes <code>|</code> represent either or.</li>
        </ul>
        <p>There are three acceptable "sets" of parameters:</p>
        <pre>
{0}?sitemap={{html|xml}}
{0}?path=*[&download=yes]
{0}?path=*[&referer=*[&title=*]]
        </pre>
        <p>
            The order of the valid parameters doesn't matter,
            but this is the recommended/canonical order.
        </p>
        <h2>Request forms</h2>
        <p>Notice that these are three different forms.</p>
'''.format(my_url))
    compressout.write_b('''    </body>
</html>
''')


def noindex(path):
    '''
    Returns True if `path` should be noindexed.
    `path` is an absolute **filesystem** path.
    '''
    def isword(w):
        letters = string.ascii_letters + ',.'
        for ch in w:
            if ch not in letters:
                return False
        return True
    # 1. White list
    # 2. Black list
    # 3. Page quality (not applicable for directories)
    # Check whitelist first.
    for regex in conf['doindex']:
        if re.match(regex, path[rootlen:]) is not None:
            return False
    # Blacklist (two kinds):
    #   - Generated from another file.
    #   - Explicitly blacklisted in 'read.cfg'.
    for match, replace in conf['madefrom']:
        if re.match(match, path[rootlen:]) is not None:
            try:
                os.stat(root + re.sub(match, replace, path[rootlen:]))
                return True
            except:
                pass
    for regex in conf['noindex'] + conf['hide']:
        if re.match(regex, path[rootlen:]) is not None:
            return True
    # Quality:
    #   - Text file
    #   - At least 3072 Unicode code points
    #   - At least 300 words
    #   - At least 60 lines
    #   - Half the limitations if a meta description and title is found
    #   - A third of the limitations if an onpage description is found
    try:
        os.listdir(path)
        return False    # Directories don't need to pass the quality check.
    except:
        pass
    # Normal file.
    try:
        if sys.version_info[0] > 2:
            text = open(path).read()
        else:
            text = open(path).read().decode('utf-8')
    except:
        return True
    min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
    quality = mk_description(path)[0] + 1
    min_chars //= quality; min_words //= quality
    min_lines //= quality; min_comments //= quality
    if len(text) < min_chars:
        return True
    if text.count('\n') + 1 < min_lines:
        return True
    # Count comment lines (Python/shell, C block and XML style) and words.
    n_comments = 0
    is_comment = re.compile('^(.*#.*| *\\* .*|.*<!--.*)$')
    for line in text.split('\n'):
        if is_comment.match(line) is not None:
            n_comments += 1
    if n_comments < min_comments:
        return True
    n_words = 0
    for word in text.split():
        if isword(word):
            n_words += 1
    if n_words < min_words:
        return True
    return False


def mk_navigation(referer, title):
    '''
    Returns the navigation bar (HTML) for the requested page.

    `referer` is a (URL, title) pair used to **optionally** ``integrate``
    the page with the referring page; see `mk_referer_param`.
    `title` is the title of the requested page.
    '''
    if referer[0]:
        return htmlescape.escape('''
''',
            URL=(2, referer[0]),
            title=(1, referer[1]),
            me=(1, title),
            my_url=(0, my_url),
        )
    else:
        return '''
'''


def mk_referer_param(referer):
    '''
    Returns one of:
        ''
        '&referer=' + referer[0]
        '&referer=' + referer[0] + '&title=' + referer[1]
    to be added to links from the requested page.

    `referer` is used to **optionally** ``integrate`` a page.
    See `mk_navigation`.
    '''
    if referer[0]:
        if referer[1] != 'Back':
            title = '&title={}'.format(referer[1])
        else:
            title = ''
        return '&referer={}{}'.format(referer[0], title)
    else:
        return ''


def mk_description(path):
    '''
    Returns a 4-tuple: (good, title, meta_description, onpage_description)

    `path` is the absolute filesystem path to the requested page.
    `good` is
        0   no title and description
        1   title and meta description only
        2   also an onpage description
    `title` is the title of the page.
    `meta_description` is the content of the description meta tag.
    `onpage_description` is HTML content for the onpage description of the
    requested page.
    '''
    good = 0
    title = "source code of {}".format(path[rootlen:])
    meta_description = ''
    onpage_description = None
    try:
        content = open(path + '.info').read().split('\n')
        good = 1
    except:
        pass
    if good:
        title = content[0]
        try:
            sep = content.index('.')
        except ValueError:
            sep = None
        if sep is not None:
            good = 2
            meta_description = '\n'.join(content[1:sep])
            onpage_description = '\n'.join(content[sep+1:])
        else:
            meta_description = '\n'.join(content[1:])
    if onpage_description is None:
        onpage_description = htmlescape.escape('<p>{}</p>', 1, meta_description)
    return good, title, meta_description, onpage_description
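
# mk_description() reads an optional sidecar file: for /var/www/foo/bar.py it
# looks for /var/www/foo/bar.py.info.  The format implied by the parser above
# (a hypothetical example):
#
#   Title of the page
#   One or more lines of meta description
#   for the description meta tag.
#   .
#   One or more lines of HTML used as the
#   on-page description.
#
# The first line is the title; a line containing only "." separates the meta
# description from the on-page description; without the "." line everything
# after the title is treated as the meta description.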


def sitemap(sitemap_type):
    '''
    Write out an XML or HTML sitemap.

    sitemap_type in ('xml', 'html')

    The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
    '''
    if os.getenv('REQUEST_METHOD') != 'HEAD':   # NOTICE
        # Prevent over-revving the server.
        # HEAD requests are basically no-ops.
        maxload = conf['sitemap-maxload']
        if os.getloadavg()[0] > maxload['load-avg1']:
            status503()
            return
        try:
            access_times = list(map(
                float,
                open('read.throttlecontrol').read().strip().split(':')
            ))
        except:
            access_times = [0]
        if time.time() - access_times[-1] < maxload['throttle-time']:
            status503()
            return
        access_times.insert(0, time.time())
        access_times = access_times[:maxload['throttle-requests']]
        f = open('read.throttlecontrol', 'w')
        f.write(':'.join(list(map(str, access_times))) + '\n')
        f.close()
    # Write headers before doing anything else.
    # A HEAD request doesn't need to know the length (it's TE chunked).
    if sitemap_type == 'xml':
        compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
        compressout.write_h(
            'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
            '; rel="canonical"' +
            '; type="text/html"\n'
        )
        compressout.write_h('X-Robots-Tag: noindex\n\n')    # NOTE: last.
    elif sitemap_type == 'html':
        compressout.write_h(html_page)
        compressout.write_h('\n')
    else:
        assert False, "Neither 'xml' nor 'html'"
    if os.getenv('REQUEST_METHOD') == 'HEAD':   # NOTICE
        return
    # Find the pages worth being in the sitemap.
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    paths = []
    for basedir, dirs, files in os.walk(root, topdown=True):
        # Exclude hidden directories:
        remove_list = []
        sys.stderr.write('In {}\n'.format(basedir))
        sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
        for dirname in dirs:
            dirpath = os.path.join(basedir, dirname)[rootlen:]
            for regex in no_access:
                if re.match(regex, dirpath) is not None:
                    #dirs.remove(dirname)
                    # BUG: The for loop will skip items in the list if
                    # other items are removed while looping.
                    # This caused some real' nasty stuff like sshin to
                    # be crawled, took a whopping .65 seconds.
                    remove_list.append(dirname)
                    break
        sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
        for dirname in remove_list:
            dirs.remove(dirname)
        # Iterate over files:
        for filename in files:
            filepath = os.path.join(basedir, filename)
            # No symlinks allowed.
            #if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
            if not os.path.islink(filepath):
                #try:
                description = mk_description(filepath)
                if description[0]:
                    # Only indexable content allowed.
                    if not noindex(filepath):
                        paths.append((filepath[rootlen:], description[3]))
                    else:
                        sys.stderr.write('{} is noindexed\n'.format(filepath))
                else:
                    sys.stderr.write('{} has no description\n'.format(filepath))
                #except IOError as error:
                    #assert error.errno in (
                        #errno.EISDIR, errno.EACCES
                    #), error.errno
            else:
                sys.stderr.write('{} is link\n'.format(filepath))
    paths.sort(key=lambda x: x[0])
    # Print the body.
    if sitemap_type == 'xml':
        compressout.write_b('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
''')
        #
        for path, description in paths:
            # Loop through all the regexes:
            for regex in conf['noxmlsitemap']:
                if re.match(regex, path) is not None:
                    break
            else:
                compressout.write_b(htmlescape.escape('''    <url>
        <loc>{canonical_url}?path={path}</loc>
        <priority>0.5</priority>
''',
                    canonical_url=(0, canonical_url),
                    path=(1, path),
                ))
                mod_sitemap.lastmod_changefreq(
                    root + path,
                    compressout,
                )
                compressout.write_b('    </url>\n')
        compressout.write_b('</urlset>\n')
    elif sitemap_type == 'html':
        compressout.write_b('''<html>
    <head>
        <title>Sitemap for scripts' source code</title>
        <link rel="canonical" href="{canonical_url}?sitemap=html"/>
    </head>
    <body>
        <h1>Sitemap for scripts' source code</h1>
        <ul>
'''.format(my_url=my_url, canonical_url=canonical_url))
        #
        indent = 16 * ' '
        for path, description in paths:
            compressout.write_b(indent + htmlescape.escape(
                '''<li>
                    <a href="{my_url}?path={path}">{path}</a>
''',
                path=(0, path),
                my_url=(0, canonical_url),
            ))
            compressout.write_b(indent + htmlescape.escape(
                '<br/>{}</li>\n', 0, description
            ))
        #
        compressout.write_b('''        </ul>
    </body>
</html>
''')
    else:
        assert False, "Neither 'xml' nor 'html'"


def ls(path, referer):
    '''Directory listing for `path`.'''
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    compressout.write_b(htmlescape.escape('''<html>
    <head>
        <title>Index of {name}</title>
        <meta name="robots" content="{robots_follow}"/>
        <link rel="canonical" href="{canonical_url}?path={name}"/>
    </head>
    <body>
        {navigation}
        <h1>Index of {name}</h1>
        <p>
            {isroot_commentout_start}
            <a href="{my_url}?path={parent_path}{referer_params}">Parent directory</a>
            {isroot_commentout_end}
            <a href="{my_url}?sitemap=html">CGIread sitemap</a>
            <a href="/">Main page</a>
        </p>
        <table>
''',
        name           =(1, path[rootlen:] + '/'),
        parent_path    =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
        robots_follow  =(2, 'no'*noindex(path)+'follow'),
        navigation     =(0, mk_navigation(
            referer,
            "Index of "+path[rootlen:]+'/'
        )),
        referer_params =(2, mk_referer_param(referer)),
        my_url         =(0, my_url),
        canonical_url  =(0, canonical_url),
        isroot_commentout_start =(0, '<!--'*(path == root)),
        isroot_commentout_end   =(0, '-->'*(path == root)),
    ))
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    for x in sorted(os.listdir(path)):
        full_path = os.path.join(path, x)
        forbidden = False
        for regex in no_access:
            if re.match(regex, full_path[rootlen:]) is not None:
                forbidden = True
                break
        if forbidden:
            continue
        #url = cgi.escape(full_path, quote=True)
        try:
            os.listdir(full_path)
            is_dir = 1
        except:
            is_dir = 0
        # mobile_desc
        # desktop_desc
        if is_dir:
            mobile_desc = '->'
            desktop_desc = 'Directory'
        else:
            try:
                content = open(full_path).read()    # This fails on Python 3 !!!
                if sys.version_info[0] == 2:
                    content.decode('UTF-8')
                binary = False
            except:
                binary = True
            if binary:
                desktop_desc = 'Binary'
                mobile_desc = ':-('
            else:
                good, title, meta_d, onpage_d = mk_description(full_path)
                if good == 2:
                    desktop_desc = htmlescape.escape('{}', 1, meta_d)
                    if noindex(full_path):
                        mobile_desc = ':-)'
                    else:
                        mobile_desc = ':-D'
                elif not noindex(full_path):
                    mobile_desc = ':-)'
                    if compressout.debug_cookie:
                        desktop_desc = 'Text; indexable'
                    else:
                        desktop_desc = 'Text'
                else:
                    mobile_desc = ':-|'
                    if compressout.debug_cookie:
                        desktop_desc = 'Boring; unindexable'
                    else:
                        desktop_desc = 'Looks boring'
        compressout.write_b(
            htmlescape.escape(
                '''            <tr>
                <td>{mobile_desc}</td>
                <td><a href="{site}?path={path}{referer}">{text}</a></td>
                <td>{desktop_desc}</td>
            </tr>
''',
                site=(0, my_url),
                path=(2, full_path[rootlen:] + '/'*is_dir),
                referer=(2, mk_referer_param(referer)),
                text=(1, x + '/'*is_dir),
                mobile_desc=(0, mobile_desc),
                desktop_desc=(0, desktop_desc),
            )
        )
    compressout.write_b('''        </table>
    </body>
</html>
''')


def download(path):
    '''Send `path` as plain text (the ?download=yes view).'''
    if noindex(path):
        compressout.write_h('X-Robots-Tag: noindex\n')
    else:
        compressout.write_h('X-Robots-Tag: index\n')    # For verbosity.
    try:
        content = open(path).read()
        if sys.version_info[0] == 2:
            content.decode('utf-8')
        compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
        compressout.write_h(htmlescape.escape(
            'Link: <{}?path={}>', 0, canonical_url, 2, path[rootlen:]
        )
            + '; rel="canonical"; type="text/html"\n'
        )
    except:
        compressout.write_h(htmlescape.escape(
            'Link: <{}?path={}>; rel="canonical"\n',
            0, canonical_url,
            2, path[rootlen:]
        ))
        # No type specified.
    if if_none_match(path):
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'HEAD':
            compressout.write_b(content)


def cat(path, referer):
    '''Show the source code of `path` with per-line fragment ids.'''
    def ol_content(text):
        out_lines = []
        ids = []
        allowed_chars = string.ascii_letters + '_-'
        for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment for this line.
            this_id = ''
            # Find ids in Python and XHTML.
            for decltype in ('def', 'class'):
                if line.strip().startswith(decltype + ' ') and '(' in line:
                    this_id = line.split(decltype, 1)[1].split('(')[0].strip()
            if 'id="' in line:
                this_id = line.split('id="')[1].split('"')[0]
            # Prevent bad ids.
            for ch in this_id:
                if ch not in allowed_chars:
                    this_id = ''
                    break
            if this_id in ids:
                this_id = ''
            # Create the fragment identifier for the line.
            if this_id:
                ids.append(this_id)
                idline = 'id="content_{}"'.format(this_id)
            else:
                idline = ''
            # Create line
            out_lines.append(htmlescape.escape(
                '<li><span {}>{}</span></li>', 0, idline, 1, line
            ))
        return out_lines, ids
    '''        <h1>{}</h1>
        <table>
            <tr><td>Last modified</td><td></td></tr>
            <tr><td>Lines</td><td>{linecount}</td></tr>
            <tr><td>Indexable</td><td>{indexable}</td></tr>
        </table>
        <p>
            Parent directory
            Download
            CGIread sitemap
            Main page
        </p>
        <p>Quick links:</p>
        {fragments}
    '''
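

# A hypothetical dispatcher showing how the documented parameter sets
# ({0}?sitemap={html|xml}, {0}?path=*[&download=yes],
# {0}?path=*[&referer=*[&title=*]]) could map onto the functions above.
# The name `main` and all control flow here are illustrative assumptions,
# not taken from this script; a real dispatcher would also have to enforce
# conf['noaccess'], conf['topsecret'] and the other access rules.
def main():
    form = cgi.FieldStorage()
    if 'sitemap' in form:
        if form['sitemap'].value in ('html', 'xml'):
            sitemap(form['sitemap'].value)
        else:
            status400('sitemap must be "html" or "xml".')
    elif 'path' in form:
        path = os.path.normpath(root + form['path'].value)
        referer = (
            form.getvalue('referer', ''),
            form.getvalue('title', 'Back'),
        )
        if not os.path.exists(path):
            status404()
        elif form.getvalue('download', '') == 'yes':
            download(path)
        elif os.path.isdir(path):
            ls(path, referer)
        else:
            cat(path, referer)
    else:
        index_page()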