#!/usr/bin/python3
# -*- coding: utf-8 -*-

root = '/var/www'
owner = 'Oskar Skog'
my_url = '/read/'
canonical_url = 'https://__HOST__/read/'
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'

import sys
sys.path.append(root)
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap   # Name conflict with already existing function.
import cgitb
cgitb.enable()

rootlen = len(root)
#html_mime = 'text/html'    # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
conf = eval(open('read.cfg').read())


def redirect_spam(destination):
    '''`destination` is the URL to which assholes should be redirected.'''
    compressout.write_h('Status: 303\n')
    compressout.write_h('Location: {}\n'.format(destination))
    compressout.write_h('\n')


def status400(message):
    '''HTTP 400; `message` goes UNESCAPED inside an element of the page body.'''
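    # NOTE: callers must pass `message` as already-safe HTML; it is
    # interpolated into the error page without further escaping.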
    compressout.write_h('Status: 400\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    compressout.write_b('''__HTML5__
        400 - Bad Request
    
    
        __NAVIGATION__
        

400 - Bad Request

{}

Your request can't be understood. Check the parameters.

Documentation for the parameters

'''.format(message))
    compressout.write_b('''
    __FOOTER__
''')


def status403():
    '''HTTP 403'''
    compressout.write_h(html_page)
    compressout.write_h('Status: 403\n\n')
    compressout.write_b(open(html403file).read())


def status404():
    '''HTTP 404'''
    compressout.write_h('Status: 404\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    compressout.write_b(open(html404file).read())


def status503():
    '''
    HTTP 503
    Call this if there is too much load on the server to do something.
    (Used by the sitemap function.)
    '''
    compressout.write_h('Status: 503\n')
    compressout.write_h(html_page)
    # One factor is load avg for 1 minute, add some slop to the delay for bots.
    compressout.write_h('Retry-After: 90\n')
    compressout.write_h('\n')
    compressout.write_b(open(html503file).read())


def index_page():
    '''https://oskog97.com/read/'''
    # Handle 304s.
    ETag = '"{}{}{}"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat('index.py').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(html_page)
    if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
        compressout.write_h('Status: 304\n\n')
        return
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    # Write out a static page.
    compressout.write_b('''__HTML5__
        __TITLE__
        __NAVIGATION__
        __H1__
''')
    compressout.write_b('''

Interested in the scripts I have on my website? Go take a look at them; start crawling the root directory or check out the (sub)sitemap.

Parameter syntax

Descriptions for the parameters can be found in the request forms.

  • Asterisks * represent a value that can be (almost) anything.
  • Square brackets [] represent optional.
  • Curly brackets {{}} represent mandatory.
  • Pipes | represent either or.

There are three acceptable "sets" of parameters:

  1. {0}?sitemap={{html|xml}}
  2. {0}?path=*[&download=yes]
  3. {0}?path=*[&referer=*[&title=*]]

The order of the valid parameters doesn't matter, but this is the recommended/canonical order.

Request forms

Notice that these are three different forms.

Sitemap

The sitemap parameter can be either html, xml or the default none. It can't be used together with any other parameters.

Request an HTML sitemap instead of a page
Request an XML sitemap instead of a page

Page

A page (source code of a CGI script) is selected with the path parameter. The value of the path parameter is a URL relative to this site, i.e. a URL beginning with a single slash.

The path is the site-local URL to the CGI script or directory you're interested in. If you set the value to /read/index.py, you'll get the source code for this script. And if you set it to /, you'll get a directory listing of the site's root directory.

Path/URL:
Download / see it as plain text

The download parameter can be set to either yes or the default no. The download option obviously does not work with directories.

Link back to a referencing page

If download is no or unset and a page (not a sitemap) was requested, it is possible to change the navigation to make the requested page link back to a referring page.

The referer (yes, misspelled like the HTTP Referer) parameter is the URL of the referencing page. (Don't try to specify a site that isn't mine.) The title parameter gives the back link a different text than Back.

path
referer
title
'''.format(my_url))
    compressout.write_b('''
    __FOOTER__
''')


def noindex(path):
    '''
    Returns True if `path` should be noindexed.
    `path` is an absolute **filesystem** path.
    '''
    def isword(w):
        letters = string.ascii_letters + ',.'
        for ch in w:
            if ch not in letters:
                return False
        return True
    # 1. White list
    # 2. Black list
    # 3. Page quality (not applicable for directories)
    # Check whitelist first.
    for regex in conf['doindex']:
        if re.match(regex, path[rootlen:]) is not None:
            return False
            break
    # Blacklist (two kinds):
    # - Generated from another file.
    # - Explicitly blacklisted in 'read.cfg'.
    for match, replace in conf['madefrom']:
        if re.match(match, path[rootlen:]) is not None:
            try:
                os.stat(root + re.sub(match, replace, path[rootlen:]))
                return True
            except:
                pass
    for regex in conf['noindex'] + conf['hide']:
        if re.match(regex, path[rootlen:]) is not None:
            return True
    # Quality:
    # - Text file
    # - At least 3072 Unicode code points
    # - At least 300 words
    # - At least 60 lines
    # - Half the limitations if a meta description and title are found
    # - A third of the limitations if an onpage description is found
    try:
        os.listdir(path)
        return False
    except:
        pass
    # Normal file.
    try:
        if sys.version_info[0] > 2:
            text = open(path).read()
        else:
            text = open(path).read().decode('utf-8')
    except:
        return True
    min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
    quality = mk_description(path)[0] + 1
    min_chars //= quality; min_words //= quality
    min_lines //= quality; min_comments //= quality
    if len(text) < min_chars:
        return True
    if text.count('\n') + 1 < min_lines:
        return True
    n_comments = 0
    is_comment = re.compile('^(.*#.*| *\\* .*|.* ''', URL=(2, referer[0]), title=(1, referer[1]), me=(1, title), my_url=(0, my_url), ) else: return '''__NAVIGATION__'''


def mk_referer_param(referer):
    '''Returns one of:
        ''
        '&referer=' + referer[0]
        '&referer=' + referer[0] + '&title=' + referer[1]
    to be added to links from the requested page.

    `referer` is used to **optionally** ``integrate`` a page.
    See `mk_navigation`
    '''
    if referer[0]:
        if referer[1] != 'Back':
            title = '&title={}'.format(referer[1])
        else:
            title = ''
        return '&referer={}{}'.format(referer[0], title)
    else:
        return ''


def mk_description(path):
    '''
    Returns a tuple: (good, title, meta_description, onpage_description)

    `path` is the absolute filesystem path to the requested page.

    `good` is
        0   no title and description
        1   title and meta description only
        2   also an onpage description
    `title` is the title of the page.
    `meta_description` is the content of the description meta tag.
    `onpage_description` is HTML content for the onpage description.
    '''
    good = 0
    title = "source code of {}".format(path[rootlen:])
    meta_description = ''
    onpage_description = None
    try:
        content = open(path + '.info').read().split('\n')
        good = 1
    except:
        pass
    if good:
        title = content[0]
        try:
            sep = content.index('.')
        except ValueError:
            sep = None
        if sep is not None:
            good = 2
            meta_description = '\n'.join(content[1:sep])
            onpage_description = '\n'.join(content[sep+1:])
        else:
            meta_description = '\n'.join(content[1:])
    if onpage_description is None:
        onpage_description = htmlescape.escape('''

{}

''', 1, meta_description)
    return good, title, meta_description, onpage_description


def sitemap(sitemap_type):
    '''
    Write out an XML or HTML sitemap.
        sitemap_type in ('xml', 'html')
    The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
    '''
    if os.getenv('REQUEST_METHOD') != 'HEAD':   # NOTICE
        # Prevent over-revving the server.
        # HEAD requests are basically no-ops.
        maxload = conf['sitemap-maxload']
        if os.getloadavg()[0] > maxload['load-avg1']:
            status503()
            return
        try:
            access_times = list(map(
                float,
                open('read.throttlecontrol').read().strip().split(':')
            ))
        except:
            access_times = [0]
        if time.time() - access_times[-1] < maxload['throttle-time']:
            status503()
            return
        access_times.insert(0, time.time())
        access_times = access_times[:maxload['throttle-requests']]
        f = open('read.throttlecontrol', 'w')
        f.write(':'.join(list(map(str, access_times))) + '\n')
        f.close()
    # Write headers before doing anything else.
    # A HEAD request doesn't need to know the length (it's TE chunked).
    if sitemap_type == 'xml':
        compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
        compressout.write_h(
            'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
            '; rel="canonical"' +
            '; type="text/html"\n'
        )
        compressout.write_h('X-Robots-Tag: noindex\n\n')    # NOTE: last.
    elif sitemap_type == 'html':
        compressout.write_h(html_page)
        compressout.write_h('\n')
    else:
        assert False, "Neither 'xml' nor 'html'"
    if os.getenv('REQUEST_METHOD') == 'HEAD':   # NOTICE
        return
    # Find the pages worth being in the sitemap.
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    paths = []
    for basedir, dirs, files in os.walk(root, topdown=True):
        # Exclude hidden directories:
        remove_list = []
        sys.stderr.write('In {}\n'.format(basedir))
        sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
        for dirname in dirs:
            dirpath = os.path.join(basedir, dirname)[rootlen:]
            for regex in no_access:
                if re.match(regex, dirpath) is not None:
                    #dirs.remove(dirname)
                    # BUG: The for loop will skip items in the list if
                    # other items are removed while looping.
                    # This caused some real' nasty stuff like sshin to
                    # be crawled, took a whopping .65 seconds.
                    remove_list.append(dirname)
                    break
        sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
        for dirname in remove_list:
            dirs.remove(dirname)
        # Iterate over files:
        for filename in files:
            filepath = os.path.join(basedir, filename)
            # No symlinks allowed.
            #if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
            if not os.path.islink(filepath):
                #try:
                description = mk_description(filepath)
                if description[0]:
                    # Only indexable content allowed.
                    if not noindex(filepath):
                        paths.append((filepath[rootlen:], description[3]))
                    else:
                        sys.stderr.write('{} is noindexed\n'.format(filepath))
                else:
                    sys.stderr.write('{} has no description\n'.format(filepath))
                #except IOError as error:
                #    assert error.errno in (
                #        errno.EISDIR, errno.EACCES
                #    ), error.errno
            else:
                sys.stderr.write('{} is link\n'.format(filepath))
    paths.sort(key=lambda x: x[0])
    # Print the body.
    if sitemap_type == 'xml':
        compressout.write_b('''
''')
        #
        for path, description in paths:
            # Loop through all the regexes:
            for regex in conf['noxmlsitemap']:
                if re.match(regex, path) is not None:
                    break
            else:
                compressout.write_b(htmlescape.escape('''
                    {canonical_url}?path={path}
                    0.5
''',
                    canonical_url=(0, canonical_url),
                    path=(1, path),
                ))
                mod_sitemap.lastmod_changefreq(
                    root + path,
                    compressout,
                )
                compressout.write_b('\n')
        #
        compressout.write_b('\n')
    elif sitemap_type == 'html':
        compressout.write_b('''__HTML5NC__
        Sitemap for scripts' source code
        __NAVIGATION__

Sitemap for scripts' source code

Root directory

'''.format(my_url=my_url, canonical_url=canonical_url))
        #
        indent = 16 * ' '
        for path, description in paths:
            compressout.write_b(indent + htmlescape.escape(
                '''
{path}
\n''',
                path=(0, path),
                my_url=(0, canonical_url),
            ))
            compressout.write_b(indent + htmlescape.escape('''
{}
\n''', 0, description)
            )
        #
        compressout.write_b('''
        __FOOTER__
''')
    else:
        assert False, "Neither 'xml' nor 'html'"


def ls(path, referer):
    '''
    Write out a directory listing page for `path`.
    '''
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    compressout.write_b('''__HTML5NC__''')
    compressout.write_b(htmlescape.escape('''
        Index of {name}
        {navigation}

Index of {name}

{isroot_commentout_start} Parent directory {isroot_commentout_end} CGIread sitemap Main page

''',
        name          =(1, path[rootlen:] + '/'),
        parent_path   =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
        robots_follow =(2, 'no'*noindex(path)+'follow'),
        navigation    =(0, mk_navigation(
            referer,
            "Index of "+path[rootlen:]+'/'
        )),
        referer_params=(2, mk_referer_param(referer)),
        my_url=(0, my_url),
        canonical_url=(0, canonical_url),
        isroot_commentout_start=(0, ''*(path == root)),
        isroot_commentout_end=(0, ''*(path == root)),
    ))
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    for x in sorted(os.listdir(path)):
        full_path = os.path.join(path, x)
        forbidden = False
        for regex in no_access:
            if re.match(regex, full_path[rootlen:]) is not None:
                forbidden = True
                break
        if forbidden:
            continue
        #url = cgi.escape(full_path, quote=True)
        try:
            os.listdir(full_path)
            is_dir = 1
        except:
            is_dir = 0
        # mobile_desc
        # desktop_desc
        if is_dir:
            mobile_desc = '->'
            desktop_desc = 'Directory'
        else:
            try:
                content = open(full_path).read()
                # This fails on Python 3 !!!
                if sys.version_info[0] == 2:
                    content.decode('UTF-8')
                binary = False
            except:
                binary = True
            if binary:
                desktop_desc = 'Binary'
                mobile_desc = ':-('
            else:
                good, title, meta_d, onpage_d = mk_description(full_path)
                if good == 2:
                    desktop_desc = htmlescape.escape('{}', 1, meta_d)
                    if noindex(full_path):
                        mobile_desc = ':-)'
                    else:
                        mobile_desc = ':-D'
                elif not noindex(full_path):
                    mobile_desc = ':-)'
                    if compressout.debug_cookie:
                        desktop_desc = 'Text; indexable'
                    else:
                        desktop_desc = 'Text'
                else:
                    mobile_desc = ':-|'
                    if compressout.debug_cookie:
                        desktop_desc = 'Boring; unindexable'
                    else:
                        desktop_desc = 'Looks boring'
        compressout.write_b(
            htmlescape.escape(
                '''
                {mobile_desc} {text} {desktop_desc}
                ''',
                site=(0, my_url),
                path=(2, full_path[rootlen:] + '/'*is_dir),
                referer=(2, mk_referer_param(referer)),
                text=(1, x + '/'*is_dir),
                mobile_desc=(0, mobile_desc),
                desktop_desc=(0, desktop_desc),
            )
        )
    compressout.write_b('''
    __FOOTER__
\n''')


def download(path):
    if noindex(path):
        compressout.write_h('X-Robots-Tag: noindex\n')
    else:
        compressout.write_h('X-Robots-Tag: index\n')    # For verbosity.
    try:
        content = open(path).read()
        if sys.version_info[0] == 2:
            content.decode('utf-8')
        compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
        compressout.write_h(htmlescape.escape(
                'Link: <{}?path={}>', 0, canonical_url, 2, path[rootlen:]
            ) + '; rel="canonical"; type="text/html"\n'
        )
    except:
        compressout.write_h(htmlescape.escape(
            'Link: <{}?path={}>; rel="canonical"\n',
            0, canonical_url, 2, path[rootlen:]
        ))      # No type specified.
    if if_none_match(path):
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'HEAD':
            compressout.write_b(content)


def cat(path, referer):
    '''
    Write out a page showing the source code of `path`, line by line.
    '''
    def ol_content(text):
        out_lines = []
        ids = []
        allowed_chars = string.ascii_letters + '_-'
        for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment for this line.
            this_id = ''
            # Find ids in Python and XHTML
            for decltype in ('def', 'class'):
                if line.strip().startswith(decltype + ' ') and '(' in line:
                    this_id = line.split(decltype, 1)[1].split('(')[0].strip()
            if 'id="' in line:
                this_id = line.split('id="')[1].split('"')[0]
            # Prevent bad ids.
            for ch in this_id:
                if ch not in allowed_chars:
                    this_id = ''
                    break
            if this_id in ids:
                this_id = ''
            # Create the fragment identifier for the line.
            if this_id:
                ids.append(this_id)
                idline = 'id="content_{}"'.format(this_id)
            else:
                idline = ''
            # Create line
            out_lines.append(htmlescape.escape('''
  • {}
  • \n''', 0, index + 1, 0, idline, 1, line,
            ))
        fragment_links = []
        for fragment in sorted(ids):
            fragment_links.append(
                (
                    '{0}\n'
                ).format(
                    fragment
                )
            )
        return ''.join(out_lines), ''.join(fragment_links)
    try:
        content = open(path).read()
        if sys.version_info[0] == 2:
            content.decode('utf-8')
    except:
        if noindex(path):
            compressout.write_h('X-Robots-Tag: noindex\n')
        else:
            compressout.write_h('X-Robots-Tag: index\n')
        compressout.write_h('\n')
        compressout.write_b(content)
        return
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    ignore, title, meta_description, p_description = mk_description(path)
    last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
    lines, fragment_links = ol_content(content)
    if not fragment_links:
        fragment_links = '(none)'
    compressout.write_b('''__HTML5NC__''')
    compressout.write_b(''' ''')
    parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
    compressout.write_b(htmlescape.escape('''
        {title}
        {navigation}

    {title}

    {content_description}
    {begin_debug}{end_debug}
    Last modified {last_modified}
    Lines {linecount}
    Indexable {indexable}

    Parent directory Download CGIread sitemap Main page

    Quick links:\n{fragments}

      {content}
''',
        title=(2, title),
        content=(0, lines),
        parent_dir=(2, parent_link + mk_referer_param(referer)),
        navigation=(0, mk_navigation(referer, path[rootlen:])),
        canonical=(2, canonical_url + '?path=' + path[rootlen:]),
        path=(2, path[rootlen:]),
        noindex_no=(2, 'no' * noindex(path)),
        meta_description=(2, meta_description),
        content_description=(0, p_description),
        last_modified=(2, last_modified),
        linecount=(1, content.count('\n') + 1),
        indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
        fragments=(0, fragment_links),
        my_url=(0, my_url),
        begin_debug=(0, ['', ''][compressout.debug_cookie]),
        end_debug=(0, ['', ''][compressout.debug_cookie]),
    ))
    compressout.write_b('''
    __FOOTER__
''')


def if_none_match(path):
    '''
    ETag handling for `cat`, `ls` and `download`:

    Returns `True` if content needs to be generated.
    Outputs necessary headers and 304 statuses.
    '''
    try:
        meta_time = os.stat(path + '.info').st_mtime
    except:
        meta_time = 0
    if sys.version_info[0] > 2:
        query_string = os.getenv('QUERY_STRING', '').encode('utf-8')
    else:
        query_string = os.getenv('QUERY_STRING', '')
    ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat(path).st_mtime,
        meta_time,
        base64.b64encode(query_string),
        os.stat('index.py').st_mtime,
        os.stat('read.cfg').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(
        '''X-ETag-Synopsis: [x][z]-f_time(m_time)-query-(s_time-c_time)
X-ETag-Description-x: "Client accepts application/xhtml+xml"
X-ETag-Description-z: "Content-Encoding: gzip"
X-ETag-Description-f_time: "Unix last modified time for the requested file"
X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
X-ETag-Description-query: "base64 encoded $QUERY_STRING"
X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
''')
    if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
        compressout.write_h('Status: 304\n\n')
        return False
    else:
        return True


def is_injection_attempt(path_param, referer_URI, referer_title):
    '''
    Various checks to see if any form of injection attempt has been made.
    This function checks the `path`, `referer` and `title` parameters.

    Returns True if the request is an injection attempt.
    - XSS
    - URL injection
    - Spam injection
    - Restricted files access
    '''
    # If the path parameter contains an XSS attempt, it can't be corrected.
    evil = False
    # Prevent attacks.
    if '..' in path_param:
        return True
    for var in referer_URI, referer_title:
        for ch in var:
            if ord(ch) < 32:
                return True
            if ch in '<>&\'"':
                return True
            # NOTICE: The following will limit parameters to ASCII.
            if ord(ch) > 126:
                return True
    # Prevent linking to Mallory.
    for start in ('http://', 'https://', '//', 'ftp://'):
        if referer_URI.startswith(start):
            hostname = referer_URI.split('//')[1].split('/')[0]
            if hostname not in conf['allowed-referer-hosts']:
                return True
            else:
                break
    else:
        if ':' in referer_URI:
            return True
    # Prevent injected spam.
    if spammy.spammy(referer_title) or len(referer_title) > 42:
        return True
    # No match.
    return False


def handle_injection_attempt(path_param, referer_URI, referer_title):
    '''
    Decide if the injection attempt was due to innocently following
    a malicious link or due to creating one.
    '''
    # Check if the URL can be sanitized.
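    # The referer and title parameters are deliberately blanked out here: if
    # the path alone still trips is_injection_attempt(), the URL cannot be
    # sanitized; otherwise the visitor is redirected to a cleaned-up URL that
    # keeps only the path.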
    if is_injection_attempt(path_param, '', ''):
        destination = 'https://en.wikipedia.org/wiki/Data_validation'
    else:
        destination = my_url + '?path=' + path_param
    redirect_spam(destination)


def main():
    '''
    `compressout.init` MUST be called before `main` and `compressout.done` after.
    '''
    # HTML vs XHTML
    global html_page
    html_page = 'Vary: Accept\n'
    if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
        html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
    else:
        html_page += 'Content-Type: text/html; charset=UTF-8\n'
    # Check that the method is either GET, HEAD or OPTIONS.
    if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
        if os.getenv('REQUEST_METHOD') != 'OPTIONS':
            compressout.write_h('Status: 405\n')
        compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
        compressout.write_h('Content-Type: text/plain\n')
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'OPTIONS':
            compressout.write_b('Method not allowed!\n')
            compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
        return
    # Get the parameters.
    params = cgi.FieldStorage()
    path = path_param = params.getfirst('path', default='')
    referer_URI = params.getfirst('referer', default='')
    referer_title = params.getfirst('title', default='Back')
    referer = (referer_URI, referer_title)
    download_flag = params.getfirst('download', default='no')
    sitemap_param = params.getfirst('sitemap', default='none')
    if not os.getenv('QUERY_STRING'):
        index_page()
        return
    # Bad request, but will match the evil patterns.
    # Keep it before the evil stopper.
    if bool(path_param) and not path_param.startswith('/'):
        status400('`path` is not relative to this site. (No leading slash.)')
        return
    # Do not allow evil requests.
    allow = True
    # Keep things within the server root.
    try:
        path = os.path.realpath(root + path)
    except:
        allow = False
    if path != root and not path.startswith(root + '/'):
        allow = False
    # Stop at forbidden paths. #1/2
    for regex in conf['noaccess']:
        if re.match(regex, path[rootlen:]) is not None:
            allow = False
    # Prevent XSS, URL injection, spam injection and miscellaneous assholery.
    if is_injection_attempt(path_param, referer_URI, referer_title):
        allow = False
    if not allow:
        handle_injection_attempt(path_param, referer_URI, referer_title)
        return
    # Bad requests:
    if download_flag not in ('yes', 'no'):
        status400('`download` MUST be "yes", "no" or unset.')
        return
    if bool(path_param) and sitemap_param != 'none':
        status400('The `sitemap` parameter cannot be used with any other.')
        return
    if download_flag == 'yes' and bool(referer_URI):
        status400("`download=yes` can't be used with the `referer` parameter.")
        return
    if sitemap_param not in ('none', 'xml', 'html'):
        status400('`sitemap` MUST be "html", "xml" or unset.')
        return
    if download_flag == 'yes' and not bool(path_param):
        status400('Nothing to `download`. Use the `path` parameter.')
        return
    if bool(referer_URI) and not bool(path_param):
        status400('`referer` cannot be used without `path`.')
        return
    if referer_title != 'Back' and not bool(referer_URI):
        status400('`referer` is not set.')
        return
    if allow:
        # Generate sitemap?
        if sitemap_param != 'none':
            sitemap(sitemap_param)
        else:
            # Stop at forbidden paths. #2/2
            for regex in conf['topsecret']:
                if re.match(regex, path[rootlen:]) is not None:
                    status404()
                    break
            else:
                # Allowed to be seen.
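                # Dispatch on the type of `path`: os.listdir() succeeds for a
                # directory, raises ENOTDIR for a regular file and ENOENT for
                # a nonexistent path.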
                try:
                    os.listdir(path)
                    if download_flag == 'no':
                        if if_none_match(path):
                            ls(path, referer)
                    else:
                        status400("Can't download a directory.")
                except OSError as e:
                    if e.errno == errno.ENOTDIR:
                        if download_flag == 'no':
                            if if_none_match(path):
                                cat(path, referer)
                        else:
                            # `download` sets a few headers.
                            download(path)
                    elif e.errno == errno.ENOENT:
                        status404()
                    else:
                        raise ValueError(
                            'errno must be either ENOTDIR or ENOENT'
                        )


if __name__ == '__main__':
    compressout.init()
    main()
    compressout.done()
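
# ---------------------------------------------------------------------------
# Rough sketch (an assumption, for reference only) of the `compressout`
# interface this script relies on; the real module lives elsewhere on the
# site and is imported from `root`:
#
#     compressout.init()          # inspect Accept-Encoding / debug cookie,
#                                 # set up the output buffers
#     compressout.write_h(text)   # queue a CGI header line
#     compressout.write_b(text)   # queue body content (gzipped if accepted)
#     compressout.done()          # emit the headers and the buffered body
#     compressout.debug_cookie    # truthy when the visitor enabled debugging
# ---------------------------------------------------------------------------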