CGI source code reader script

This is (the source of) the script that generates this very page.

Through this, you can see the source code for all the scripts on my site.

    Requirements for a file for its /read/ page to be indexable:
  • Always indexable if whitelisted
  • Not manually blacklisted
  • Not made from another file
  • Text file
  • At least 3072/1536/1024 Unicode code points
  • At least 300/150/100 "words"
  • At least 60/30/20 lines
  • At least 24/12/8 comments
Last modified: (see page footer)
Lines: 1128
Indexable: Yes

Parent directory Download CGIread sitemap Main page

Quick links: cat code content description download forms handle_injection_attempt if_none_match index_page is_injection_attempt isword ls main mk_description mk_navigation mk_referer_param navigation noindex ol_content redirect_spam sitemap syntax title

#!/usr/bin/python
# -*- coding: utf-8 -*-
# CGIread: CGI script that serves the (line-numbered, linkable) source code
# of the site's own scripts, plus directory listings and sitemaps.

# Site-wide configuration constants.
root = '/var/www'          # Filesystem root of the served site.
owner = 'Oskar Skog'
my_url = '/read/'          # Site-local URL of this script.
canonical_url = 'https://__HOST__/read/'
# Static error pages, served verbatim by the status* functions below.
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'
import sys
sys.path.append(root)      # Make the site-local modules below importable.
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap  # Name conflict with already existing function.
import cgitb
cgitb.enable()
rootlen = len(root)        # Used everywhere to strip `root` off fs paths.
#html_mime = 'text/html'      # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
# NOTE(review): eval() executes arbitrary code from read.cfg; the file must
# only ever be writable by the site owner.
conf = eval(open('read.cfg').read())
  29. def redirect_spam(destination):
  30.     '''`destination` is the URL to which assholes should be redirected.'''
  31.     compressout.write_h('Status: 303\n')
  32.     compressout.write_h('Location: {}\n'.format(destination))
  33.     compressout.write_h('\n')
  34. def status400(message):
  35.     '''HTTP 400; `message` goes UNESCAPED inside a <pre> element.'''
  36.     compressout.write_h('Status: 400\n')
  37.     compressout.write_h(html_page)
  38.     compressout.write_h('\n')
  39.     compressout.write_b('''__HTML5__
  40.         <title>400 - Bad Request</title>
  41.     </head>
  42.     <body>
  43.         __NAVIGATION__
  44.         <main><div id="content">
  45.             <h1 id="title">400 - Bad Request</h1>
  46.             <pre>{}</pre>
  47.             <p>
  48.                 Your request can't be understood.
  49.                 Check the parameters.
  50.             </p>
  51.             <p><a href="/read/">Documentation for the parameters</a></p>
  52.         </div></main>
  53. '''.format(message))
  54.     compressout.write_b('''
  55.         __FOOTER__
  56.     </body>
  57. </html>''')
  58. def status403():
  59.     '''HTTP 403'''
  60.     compressout.write_h(html_page)
  61.     compressout.write_h('Status: 403\n\n')
  62.     compressout.write_b(open(html403file).read())
  63. def status404():
  64.     '''HTTP 404'''
  65.     compressout.write_h('Status: 404\n')
  66.     compressout.write_h(html_page)
  67.     compressout.write_h('\n')
  68.     compressout.write_b(open(html404file).read())
  69. def status503():
  70.     '''
  71.     HTTP 503
  72.     
  73.     Call this if there is too much load on the server to do something.
  74.     (Used by the sitemap function.)
  75.     '''
  76.     compressout.write_h('Status: 503\n')
  77.     compressout.write_h(html_page)
  78.     # One factor is load avg for 1 minute, add some slop to the delay for bots.
  79.     compressout.write_h('Retry-After: 90\n')
  80.     compressout.write_h('\n')
  81.     compressout.write_b(open(html503file).read())
def index_page():
    '''https://oskog97.com/read/

    Serve the static front page: conditional-request (ETag/304) handling
    followed by a fixed HTML body describing the accepted parameters.
    '''
    # Handle 304s.  The ETag encodes: whether the page is served as XHTML,
    # whether the client accepts gzip, and this script's mtime.
    ETag = '"{}{}{}"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat('index.py').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(html_page)
    if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
        # Client already has the current version.
        compressout.write_h('Status: 304\n\n')
        return
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        # Headers only; skip the body entirely.
        return
    # Write out a static page.
    compressout.write_b('''__HTML5__
    <!-- With canonical link tag. -->
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <meta name="description" content="Interested in the scripts I have
        on my website? Come and take a look at them."/>
        __TITLE__
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content">
            __H1__
    ''')
    # Main body: parameter syntax documentation and the three request forms.
    # {0} is `my_url` (filled in by .format below).
    compressout.write_b('''
            <p>
                Interested in the scripts I have on my website?
                Go take a look at them; start crawling the
                <a href="{0}?path=/">root directory</a> or take a look
                at the <span class="a"><a href="{0}?sitemap=html"
                >(sub)sitemap</a>.</span>
            </p>
            <div id="syntax">
                <h2>Parameter syntax</h2>
                <p>
                    Descriptions for the parameters can be found in
                    the request forms.
                </p>
                <ul>
                    <li>
                        Asterisks <q>*</q> represent a value that can be
                        (almost) anything.
                    </li>
                    <li>Square brackets <q>[]</q> represent optional.</li>
                    <li>Curly brackets <q>&#x7b;&#x7d;</q> represent mandatory.</li>
                    <li>Pipes <q>|</q> represent either or.</li>
                </ul>
                <p>There are three acceptable "sets" of parameters:</p>
                <ol>
<li><pre>{0}?sitemap=&#x7b;html|xml&#x7d;</pre></li>
<li><pre>{0}?path=*[&amp;download=yes]</pre></li>
<li><pre>{0}?path=*[&amp;referer=*[&amp;title=*]]</pre></li>
                </ol>
                <p>
                    The order of the valid parameters doesn't matter, but
                    this is the recommended/canonical order.
                </p>
            </div>
            <div id="forms">
                <h2>Request forms</h2>
                <p><strong>
                    Notice that these are three different forms.
                </strong></p>
                <form action="{0}" method="get">
                <h3>Sitemap</h3>
                <p>
                    The <code>sitemap</code> parameter can be either
                    <q><code>html</code></q>, <q><code>xml</code></q>
                    or the default <q><code>none</code></q>.
                    It can't be used together with any other parameters.
                </p>
                <p>
                    <input type="radio" name="sitemap" value="html"/>
                    Request an HTML sitemap instead of a page<br/>
                    <input type="radio" name="sitemap" value="xml"/>
                    request an XML sitemap instead of a page<br/>
                    <input type="submit"/>
                </p>
                </form>
                <form action="{0}" method="get">
                <h3>Page</h3>
                <p>
                    A page (source code of a CGI script) is selected with the
                    <code>path</code> parameter.  The value of the
                    <code>path</code> parameter is a URL relative to this
                    site, ie. an URL beginning with a single slash.
                </p>
                <p>
                    The <code>path</code> is the site-local URL to the CGI
                    script or directory you're interested in.  If you set the
                    value to <q><code>/read/index.py</code></q>, you'll get the
                    source code for this script. And if you set it to
                    <q><code>/</code></q>, you'll get a directory listing
                    of the site's root directory.
                </p>
                <p>
                    Path/URL: <input type="text" name="path" value="/"/>
                    <input type="submit"/><br/>
                    <input type="checkbox" name="download" value="yes"/>
                    Download / see it as plain text
                    
                </p>
                <p>
                    The <code>download</code> parameter can be set to either
                    <q><code>yes</code></q> or the default
                    <q><code>no</code></q>.  The download option does
                    obviously not work with directories.
                </p>
                </form>
                <form action="{0}" method="get">
                <h3>Link back to a referencing page</h3>
                <p>
                    If <code>download</code> is <q><code>no</code></q> or
                    unset and a page (not a sitemap) was requested, it is
                    possible to change the navigation to make the requested
                    page link back to a referring page.
                </p>
                <p>
                    The <code>referer</code> (yes, misspelled like the HTTP
                    Referer) parameter is the URL of the referencing page.
                    (Don't try to specify a site that isn't mine.)
                    The <code>title</code> parameter gives the back link a
                    different text than <q>Back</q>.
                </p>
                <table>
                    <tr>
                        <th><code>path</code></th>
                        <td><input type="text" name="path" value="/"/></td>
                    </tr>
                    <tr>
                        <th><code>referer</code></th>
                        <td><input type="text" name="referer"/></td>
                    </tr>
                    <tr>
                        <th><code>title</code></th>
                        <td><input type="text" name="title"/></td>
                    </tr>
                    <tr>
                        <td></td>
                        <td><input type="submit"/></td>
                    </tr>
                </table>
                </form>
            </div>
        </div></main>
    '''.format(my_url))
    compressout.write_b('''
        __FOOTER__
    </body>
</html>
''')
  239. def noindex(path):
  240.     '''
  241.     Returns True if `path` should be noindexed.
  242.     
  243.     `path` is an absolute **filesystem** path.
  244.     '''
  245.     def isword(w):
  246.         letters = string.letters + ',.'
  247.         for ch in w:
  248.             if w not in letters:
  249.                 return False
  250.         return True
  251.     # 1. White list
  252.     # 2. Black list
  253.     # 3. Page quality (not applicable for directories)
  254.     
  255.     # Check whitelist first.
  256.     for regex in conf['doindex']:
  257.         if re.match(regex, path[rootlen:]) is not None:
  258.             return False
  259.             break
  260.     
  261.     # Blacklist (two kinds):
  262.     # - Generated from another file.
  263.     # - Explicitly blacklisted in 'read.cfg'.
  264.     for match, replace in conf['madefrom']:
  265.         if re.match(match, path[rootlen:]) is not None:
  266.             try:
  267.                 os.stat(root + re.sub(match, replace, path[rootlen:]))
  268.                 return True
  269.             except:
  270.                 pass
  271.     for regex in conf['noindex'] + conf['hide']:
  272.         if re.match(regex, path[rootlen:]) is not None:
  273.             return True
  274.     
  275.     # Quality:
  276.     #   - Text file
  277.     #   - At least 3072 Unicode code points
  278.     #   - At least 300 words
  279.     #   - At least 60 lines
  280.     #   - Half the limitations if a meta description and title is found
  281.     #   - A third of the limimitations if an onpage description is found
  282.     try:
  283.         os.listdir(path)
  284.         return False
  285.     except:
  286.         pass
  287.     # Normal file.
  288.     try:
  289.         text = open(path).read().decode('utf-8')
  290.     except:
  291.         return True
  292.     min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
  293.     quality = mk_description(path)[0] + 1
  294.     min_chars //= quality; min_words //= quality
  295.     min_lines //= quality; min_comments //= quality
  296.     if len(text) < min_chars:
  297.         return True
  298.     if text.count('\n') + 1 < min_lines:
  299.         return True
  300.     n_comments = 0
  301.     is_comment = re.compile('^(.*#.*| *\\* .*|.*<!--.*|.*\'\'\'.*)$')
  302.     for line in text.split('\n'):
  303.         if re.match(is_comment, line) is not None:
  304.             n_comments += 1
  305.     if n_comments < min_comments:
  306.         return True
  307.     if len(filter(isword, text.replace('\n', ' ').split(' '))) < min_words:
  308.         return True
  309.     # Passed the quality tests:
  310.     return False
  311. def mk_navigation(referer, title):
  312.     '''
  313.     Returns a string which is the navigation bar's HTML.
  314.     
  315.     `title` is the title of the requested page.
  316.     
  317.     `referer` is used to **optionally** ``integrate`` a page.
  318.     `referer` is a tuple of (URL, title) for the "back" link.
  319.     '''
  320.     if referer[0]:
  321.         return htmlescape.escape('''<!-- Navigation generated by CGIread. -->
  322. <nav><div id="navigation"><div id="nav_inner">
  323. <p><a href="#content" class="textonly">Skip navigation</a></p>
  324. <p class="row">
  325. <span class="textonly" translate="no">[</span><a class="head" href="{URL}">{title}</a><span class="textonly" translate="no">]</span>
  326. &gt;&gt;
  327. <span class="textonly" translate="no">]</span><span class="sub active">{me}</span><span class="textonly" translate="no">[</span>
  328. <span class="textonly" translate="no">[</span><a class="sub" href="{my_url}?sitemap=html">Sitemap for website's scripts</a><span class="textonly" translate="no">]</span>
  329. </p>
  330. <p class="row">
  331. <span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
  332. &gt;&gt;
  333. <span class="textonly" translate="no">[</span><a class="sub" href="/read/">Website's scripts</a><span class="textonly" translate="no">]</span>
  334. <span class="textonly" translate="no">[</span><a class="sub" href="/pages/policy.html">Privacy policy &amp; terms of use</a><span class="textonly" translate="no">]</span>
  335. <span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
  336. </p>
  337. <hr class="textonly"/>
  338. </div></div></nav>
  339. <!-- End of navigation. -->''',
  340.             URL=(2, referer[0]),
  341.             title=(1, referer[1]),
  342.             me=(1, title),
  343.             my_url=(0, my_url),
  344.         )
  345.     else:
  346.         return '''__NAVIGATION__'''
  347. def mk_referer_param(referer):
  348.     '''Returns one of:
  349.         ''
  350.         '&referer=' + referer[0]
  351.         '&referer=' + referer[0] + '&title=' + referer[1]
  352.     to be added to links from the requested page.
  353.     
  354.     `referer` is used to **optionally** ``integrate`` a page.
  355.     See `mk_navigation`
  356.     '''
  357.     if referer[0]:
  358.         if referer[1] != 'Back':
  359.             title = '&title={}'.format(referer[1])
  360.         else:
  361.             title = ''
  362.         return '&referer={}{}'.format(referer[0], title)
  363.     else:
  364.         return ''
  365. def mk_description(path):
  366.     '''
  367.     Return three strings: (good, title, meta_description, onpage_description)
  368.     
  369.     `path` is the absolute filesystem path to the requested page.
  370.     
  371.     `good` is
  372.         0       no title and description
  373.         1       title and meta description only
  374.         2       also an onpage description
  375.     
  376.     `title` is the title of the page.
  377.     
  378.     `meta_description` is the content of the description meta tag.
  379.     
  380.     `onpage_description` is HTML content for the onpage description.
  381.     requested page.
  382.     '''
  383.     good = 0
  384.     title = "source code of {}".format(path[rootlen:])
  385.     meta_description = ''
  386.     onpage_description = None
  387.     try:
  388.         content = open(path + '.info').read().split('\n')
  389.         good = 1
  390.     except:
  391.         pass
  392.     if good:
  393.         title = content[0]
  394.         try:
  395.             sep = content.index('.')
  396.         except ValueError:
  397.             sep = None
  398.         if sep is not None:
  399.             good = 2
  400.             meta_description = '\n'.join(content[1:sep])
  401.             onpage_description = '\n'.join(content[sep+1:])
  402.         else:
  403.             meta_description = '\n'.join(content[1:])
  404.     if onpage_description is None:
  405.         onpage_description = htmlescape.escape('<p>{}</p>',1,meta_description)
  406.     return good, title, meta_description, onpage_description
def sitemap(sitemap_type):
    '''
    Write out an XML or HTML sitemap.
    sitemap_type in ('xml', 'html')
    
    The XML sitemap will exclude entries from `conf['noxmlsitemap']`.

    Crawling the whole tree is expensive, so non-HEAD requests are
    throttled via load average and a timestamp file
    ('read.throttlecontrol').
    '''    
    
    if os.getenv('REQUEST_METHOD') != 'HEAD': # NOTICE
        # Prevent over-revving the server.
        # HEAD requests are basically no-ops.
        maxload = conf['sitemap-maxload']
        if os.getloadavg()[0] > maxload['load-avg1']:
            status503()
            return
        # Recent request timestamps, newest first, colon-separated.
        # NOTE(review): relies on Python 2 map() returning a list.
        try:
            access_times = map(
                float, open('read.throttlecontrol').read().strip().split(':')
            )
        except:
            access_times = [0]
        # Too soon since the oldest remembered request -> back off.
        if time.time() - access_times[-1] < maxload['throttle-time']:
            status503()
            return
        access_times.insert(0, time.time())
        access_times = access_times[:maxload['throttle-requests']]
        f = open('read.throttlecontrol', 'w')
        f.write(':'.join(map(str, access_times)) + '\n')
        f.close()
    # Write headers before doing anything else.
    # A HEAD request doesn't need to know the length (it's TE chunked).
    if sitemap_type == 'xml':
        compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
        compressout.write_h(
            'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
            '; rel="canonical"' +
            '; type="text/html"\n'
        )
        compressout.write_h('X-Robots-Tag: noindex\n\n') # NOTE: last.
    elif sitemap_type == 'html':
        compressout.write_h(html_page)
        compressout.write_h('\n')
    else:
        assert False, "Neither 'xml' nor 'html'"
    if os.getenv('REQUEST_METHOD') == 'HEAD': # NOTICE
        return
    
    # Find the pages worth being in the sitemap.
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    paths = []
    
    for basedir, dirs, files in os.walk(root, topdown=True):
        # Exclude hidden directories:
        remove_list = []
        for dirname in dirs:
            dirpath = os.path.join(basedir, dirname)[rootlen:]
            for regex in no_access:
                if re.match(regex, dirpath) is not None:
                    #dirs.remove(dirname)
                    # BUG: The for loop will skip items in the list if
                    # other items are removed while looping.
                    # This caused some real' nasty stuff like sshin to
                    # be crawled, took a whopping .65 seconds.
                    remove_list.append(dirname)
                    break
        # Prune after the loop; mutating `dirs` in place also tells
        # os.walk (topdown=True) not to descend into them.
        for dirname in remove_list:
            dirs.remove(dirname)
        
        # Iterate over files:
        for filename in files:
            filepath = os.path.join(basedir, filename)
            # No symlinks allowed.
            if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
                #try:
                    description = mk_description(filepath)
                    if description[0]:
                        # Only indexable content allowed.
                        if not noindex(filepath):
                            paths.append((filepath[rootlen:], description[3]))
                #except IOError as error:
                    #assert error.errno in (
                        #errno.EISDIR, errno.EACCES
                    #), error.errno
    
    paths.sort(key=lambda x: x[0])
    
    # Print the body.
    if sitemap_type == 'xml':
        compressout.write_b('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
''')
        #
        for path, description in paths:
            # Loop through all the regexes; for-else: emit the entry only
            # when no exclusion regex matched.
            for regex in conf['noxmlsitemap']:
                if re.match(regex, path) is not None:
                    break
            else:
                compressout.write_b(htmlescape.escape('''<url>
    <loc>{canonical_url}?path={path}</loc>
    <priority>0.5</priority>
''',
                    canonical_url=(0, canonical_url),
                    path=(1, path),
                ))
                mod_sitemap.lastmod_changefreq(
                    root + path,
                    compressout,
                )
                compressout.write_b('</url>\n')
        #
        compressout.write_b('</urlset>\n')
    elif sitemap_type == 'html':
        compressout.write_b('''__HTML5NC__
        <link rel="canonical" href="{canonical_url}?sitemap=html"/>
        <link rel="alternate" href="{canonical_url}?sitemap=xml"
            type="application/xml"/>
        <meta name="robots" content="noindex, follow"/>
        <title>Sitemap for scripts' source code</title>
        <meta name="description" content="
            Sitemap of all scripts available through /read/.
        "/>
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content" class="sitemap">
            <h1 id="title">Sitemap for scripts' source code</h1>
            <p><a href="{my_url}?path=/">Root directory</a></p>
            <dl>
'''.format(my_url=my_url, canonical_url=canonical_url))
        #
        indent = 16 * ' '
        for path, description in paths:
            compressout.write_b(indent + htmlescape.escape(
                '''<dt><a translate="no" href="{my_url}?path={path}">
                    {path}
                </a></dt>\n''',
                path=(0, path),
                my_url=(0, canonical_url),
            ))
            compressout.write_b(indent +
                htmlescape.escape('<dd>{}</dd>\n', 0, description)
            )
        #
        compressout.write_b('''            </dl>
        </div></main>
        __FOOTER__
    </body>
</html>
''')
    else:
        assert False, "Neither 'xml' nor 'html'"
def ls(path, referer):
    '''
    Write out an HTML directory listing for filesystem directory `path`.

    `referer` is a (URL, title) tuple, see `mk_navigation`.
    Entries matching conf['noaccess'], conf['hide'] or conf['topsecret']
    are omitted; every visible entry gets a short mobile and desktop
    description.
    '''
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    compressout.write_b('''__HTML5NC__''')
    # Page head + heading + top navigation links.  The "parent directory"
    # link is commented out (via {isroot_commentout_*}) at the site root.
    compressout.write_b(htmlescape.escape('''
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <title>Index of {name}</title>
        <meta name="robots" content="{robots_follow}, noindex"/>
        <link rel="canonical" href="{canonical_url}?path={name}"/>
    </head>
    <body>
        {navigation}
        <main><div id="content" class="ls">
            <h1 id="title">Index of <span translate="no">{name}</span></h1>
            <p class="read-nav">
                {isroot_commentout_start}
                    <a href="{my_url}?path={parent_path}{referer_params}">
                        Parent directory
                    </a>
                {isroot_commentout_end}
                <a href="{my_url}?sitemap=html">CGIread sitemap</a>
                <a href="{my_url}">Main page</a>
            </p>
            <table id="ls">
            ''',
            name          =(1, path[rootlen:] + '/'),
            parent_path   =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
            robots_follow =(2, 'no'*noindex(path)+'follow'),
            navigation    =(0, mk_navigation(
                                referer,
                                "Index of "+path[rootlen:]+'/'
                            )),
            referer_params=(2, mk_referer_param(referer)),
            my_url=(0, my_url),
            canonical_url=(0, canonical_url),
            isroot_commentout_start=(0, '<!--'*(path == root)),
            isroot_commentout_end=(0, '-->'*(path == root)),
        ))
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    
    for x in sorted(os.listdir(path)):
        full_path = os.path.join(path, x)
        
        # Skip entries matching any forbidden-pattern regex.
        forbidden = False
        for regex in no_access:
            if re.match(regex, full_path[rootlen:]) is not None:
                forbidden = True
                break
        if forbidden:
            continue
        
        #url = cgi.escape(full_path, quote=True)
        # Directory detection: listdir succeeds only on directories.
        try:
            os.listdir(full_path)
            is_dir = 1
        except:
            is_dir = 0
        # mobile_desc: short marker for narrow screens.
        # desktop_desc: fuller description for wide screens.
        if is_dir:
            mobile_desc = '<span class="yeah">-&gt;</span>'
            desktop_desc = '<span class="yeah">Directory</span>'
        else:
            content = open(full_path).read()
            # UTF-8 decodability decides text vs binary.
            try:
                content.decode('UTF-8')
                binary = False
            except:
                binary = True
            if binary:
                desktop_desc = 'Binary'
                mobile_desc = ':-('
            else:
                good, title, meta_d, onpage_d = mk_description(full_path)
                if good == 2:
                    # Has a full description; highlight if also indexable.
                    desktop_desc = htmlescape.escape(
                        '<span class="thenumberofthebeast">{}</span>',
                        1, meta_d
                    )
                    if noindex(full_path):
                        mobile_desc = '<span class="yeah">:-)</span>'
                    else:
                        mobile_desc = '<span class="thenumberofthebeast">:-D</span>'
                elif not noindex(full_path):
                    mobile_desc = '<span class="yeah">:-)</span>'
                    desktop_desc = '<span class="yeah">Text; indexable</span>'
                else:
                    mobile_desc = ':-|'
                    desktop_desc = 'Boring; unindexable'
                    
        # One table row per entry; directories get a trailing slash.
        compressout.write_b(
            htmlescape.escape(
                '''<tr><td class="mobile">{mobile_desc}</td>
                <td><a translate="no"
                    href="{site}?path={path}{referer}">{text}</a></td>
                <td class="desktop">{desktop_desc}</td></tr>
                ''',
                site=(0, my_url),
                path=(2, full_path[rootlen:] + '/'*is_dir),
                referer=(2, mk_referer_param(referer)),
                text=(1, x + '/'*is_dir),
                mobile_desc=(0, mobile_desc),
                desktop_desc=(0, desktop_desc),
            )
        )
    compressout.write_b('''            <!--</p>--></table>
        </div></main>
        __FOOTER__
    </body>
</html>\n''')
def download(path):
    '''Serve the raw file `path` for the ?download=yes case.

    UTF-8 decodable content is served as text/plain with a canonical
    Link header pointing at the HTML view; anything else gets only the
    canonical Link (no Content-Type claim).  The body is written only
    when `if_none_match` says the client's cached copy is stale.
    '''
    if noindex(path):
        compressout.write_h('X-Robots-Tag: noindex\n')
    else:
        compressout.write_h('X-Robots-Tag: index\n') # For verbosity.
    content = open(path).read()
    try:
        content.decode('utf-8')
        compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
        compressout.write_h(htmlescape.escape(
                'Link: <{}?path={}>',
                0, canonical_url,
                2, path[rootlen:]
            ) + '; rel="canonical"; type="text/html"\n'
        )
    except:
        compressout.write_h(htmlescape.escape(
            'Link: <{}?path={}>; rel="canonical"\n',
            0, canonical_url,
            2, path[rootlen:]
        )) # No type specified.
    # if_none_match handles the ETag dance; falsy means 304 already sent.
    if if_none_match(path):
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'HEAD':
            compressout.write_b(content)
  698. def cat(path, referer):
  699.     '''
  700.     '''
    def ol_content(text):
        '''Render `text` as numbered <li><pre> lines plus quick links.

        Returns (out_lines, fragment_links): the joined <li> markup for
        every line of `text`, and anchor links for each fragment id found.
        Ids come from Python def/class names and XHTML id="..." attributes;
        invalid or duplicate ids are dropped.
        '''
        out_lines = []
        ids = []
        allowed_chars = string.letters + '_-'
        for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment this line.
            this_id = ''
            # Find ids in Python and XHTML
            for decltype in ('def', 'class'):
                if line.strip().startswith(decltype + ' ') and '(' in line:
                    this_id = line.split(decltype, 1)[1].split('(')[0].strip()
            if 'id="' in line:
                this_id = line.split('id="')[1].split('"')[0]
            # Prevent bad ids.
            for ch in this_id:
                if ch not in allowed_chars:
                    this_id = ''
                    break
            if this_id in ids:
                this_id = ''    # Duplicate: only the first occurrence keeps it.
            # Create the fragment identifier for the line.
            if this_id:
                ids.append(this_id)
                idline = 'id="content_{}"'.format(this_id)
            else:
                idline = ''
            # Create line (1-based line number in the id attribute).
            out_lines.append(htmlescape.escape(
                    '    <li id="{}"><pre translate="no" {}>{}</pre></li>\n',
                    0, index + 1,
                    0, idline,
                    1, line,
            ))
        # Quick-navigation links, alphabetical by fragment id.
        fragment_links = []
        for fragment in sorted(ids):
            fragment_links.append(
                (
                    '<a class="quick" href="#content_{0}" translate="no"' +
                    '>{0}</a>\n'
                ).format(
                    fragment
                )
            )
        return ''.join(out_lines), ''.join(fragment_links)
  745.     
  746.     content = open(path).read()
  747.     try:
  748.         content.decode('utf-8')
  749.     except:
  750.         if noindex(path):
  751.             compressout.write_h('X-Robots-Tag: noindex\n')
  752.         else:
  753.             compressout.write_h('X-Robots-Tag: index\n')
  754.         compressout.write_h('\n')
  755.         compressout.write_b(content)
  756.         return
  757.     compressout.write_h(html_page)
  758.     compressout.write_h('\n')
  759.     if os.getenv('REQUEST_METHOD') == 'HEAD':
  760.         return
  761.     
  762.     ignore, title, meta_description, p_description = mk_description(path)
  763.     last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
  764.     
  765.     lines, fragment_links = ol_content(content)
  766.     if not fragment_links:
  767.         fragment_links = '(none)'
  768.     
  769.     compressout.write_b('''__HTML5NC__''')
  770.     compressout.write_b('''
  771. <script type="application/ld+json">
  772. {
  773.     "@context":
  774.     {
  775.         "@vocab": "http://schema.org/"
  776.     },
  777.     "@type": "SoftwareSourceCode",
  778.     "license": "https://opensource.org/licenses/BSD-2-Clause",
  779.     "author":
  780.     {
  781.     ''')
  782.     compressout.write_b('''
  783.         "@type": "Person",
  784.         "@id": "__SITE__/",
  785.         "name": "{0}",
  786.         "url": "__SITE__/"
  787.     '''.format(owner))
  788.     compressout.write_b('''
  789.     },
  790.     "publisher": {"@id": "__SITE__/"},
  791.     "copyrightHolder": {"@id": "__SITE__/"},
  792.     ''')
  793.     compressout.write_b('''
  794.     "url": "{}#code",
  795.     "DateModified": "{}"
  796.     '''.format(
  797.         canonical_url + '?path=' + path[rootlen:],
  798.         last_modified,
  799.     ))
  800.     compressout.write_b('''
  801. }
  802. </script>
  803.     ''')
  804.     parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
  805.     compressout.write_b(htmlescape.escape('''
  806.         <link rel="stylesheet" type="text/css" href="/read/style.css"/>
  807.         <title>{title}</title>
  808.         <link rel="canonical" href="{canonical}"/>
  809.         <link
  810.             rel="alternate"
  811.             href="{canonical}&amp;download=yes"
  812.             type="text/plain"
  813.         />
  814.         <meta name="robots" content="{noindex_no}index"/>
  815.         <meta name="description" content="{meta_description}"/>
  816.     </head>
  817.     <body>
  818.         {navigation}
  819. <main><div id="content">
  820.     <h1 id="title" translate="no">{title}</h1>
  821.     <div id="description">
  822.         {content_description}
  823.     </div>
  824.     <table>
  825.         <tr>
  826.             <td>Last modified</td>
  827.             <td><time datetime="{last_modified}">{last_modified}</time></td>
  828.         </tr>
  829.         <tr>
  830.             <td>Lines</td>
  831.             <td>{linecount}</td>
  832.         </tr>
  833.         <tr>
  834.             <td>Indexable</td>
  835.             <td>{indexable}</td>
  836.         </tr>
  837.     </table>
  838.     <p class="notprint read-nav">
  839.         <a href="{my_url}?path={parent_dir}">Parent directory</a>
  840.         <a href="{my_url}?path={path}&amp;download=yes" target="_blank">Download</a>
  841.         <a href="{my_url}?sitemap=html">CGIread sitemap</a>
  842.         <a href="{my_url}">Main page</a>
  843.     </p>
  844.     <p class="notprint">
  845.         Quick links:\n{fragments}
  846.     </p>
  847. <ol id="code">
  848. {content}
  849. </ol>
  850. </div></main>
  851. ''',
  852.         title=(2, title),
  853.         content=(0, lines),
  854.         parent_dir=(2, parent_link + mk_referer_param(referer)),
  855.         navigation=(0, mk_navigation(referer, path[rootlen:])),
  856.         canonical=(2, canonical_url + '?path=' + path[rootlen:]),
  857.         path=(2, path[rootlen:]),
  858.         noindex_no=(2, 'no' * noindex(path)),
  859.         meta_description=(2, meta_description),
  860.         content_description=(0, p_description),
  861.         last_modified=(2, last_modified),
  862.         linecount=(1, content.count('\n') + 1),
  863.         indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
  864.         fragments=(0, fragment_links),
  865.         my_url=(0, my_url),
  866.     ))
  867.     compressout.write_b('''
  868.         __FOOTER__
  869.     </body>
  870. </html>
  871. ''')
  872. def if_none_match(path):
  873.     '''
  874.     ETag handling for `cat`, `ls` and `download`:
  875.     
  876.     
  877.     Returns `True` if content needs to be generated.
  878.     Outputs necessary headers and 304 statuses.
  879.     '''
  880.     try:
  881.         meta_time = os.stat(path + '.info').st_mtime
  882.     except:
  883.         meta_time = 0
  884.     ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
  885.         'x'*('application/xhtml+xml' in html_page),
  886.         'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
  887.         os.stat(path).st_mtime,
  888.         meta_time,
  889.         base64.b64encode(os.getenv('QUERY_STRING', '')),
  890.         os.stat('index.py').st_mtime,
  891.         os.stat('read.cfg').st_mtime,
  892.     )
  893.     compressout.write_h('Vary: If-None-Match\n')
  894.     compressout.write_h('ETag: {}\n'.format(ETag))
  895.     compressout.write_h(
  896. '''X-ETag-Synopsis: [x][z]-<f_time>(<m_time>)-<query>-(<s_time>-<c_time>)
  897. X-ETag-Description-x: "Client accepts application/xhtml+xml"
  898. X-ETag-Description-z: "Content-Encoding: gzip"
  899. X-ETag-Description-f_time: "Unix last modified time for the requested file"
  900. X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
  901. X-ETag-Description-query: "base64 encoded $QUERY_STRING"
  902. X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
  903. X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
  904. ''')
  905.     if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
  906.         compressout.write_h('Status: 304\n\n')
  907.         return False
  908.     else:
  909.         return True
  910. def is_injection_attempt(path_param, referer_URI, referer_title):
  911.     '''
  912.     Various checks to see if any form of injection attempt has been
  913.     made.  This function checks the `path`, `referer` and `title`
  914.     parameters.
  915.     
  916.     Returns True if the request is an injection attempt.
  917.     
  918.     - XSS
  919.     - URL injection
  920.     - Spam injection
  921.     - Restricted files access
  922.     '''
  923.     # If the path parameter contains an XSS attempt, it can't be corrected
  924.     evil = False
  925.     # Prevent attacks.
  926.     if '..' in path_param:
  927.         return True
  928.     for var in referer_URI, referer_title:
  929.         for ch in var:
  930.             if ord(ch) < 32:
  931.                 return True
  932.             if ch in '<>&\'"':
  933.                 return True
  934.             # NOTICE: The following will limit parameters to ASCII.
  935.             if ord(ch) > 126:
  936.                 return True
  937.     # Prevent linking to Mallory.
  938.     for start in ('http://', 'https://', '//', 'ftp://'):
  939.         if referer_URI.startswith(start):
  940.             hostname = referer_URI.split('//')[1].split('/')[0]
  941.             if hostname not in conf['allowed-referer-hosts']:
  942.                 return True
  943.             else:
  944.                 break
  945.     else:
  946.         if ':' in referer_URI:
  947.             return True
  948.     # Prevent injected spam
  949.     if spammy.spammy(referer_title) or len(referer_title) > 42:
  950.         return True
  951.     # No match.
  952.     return False
  953. def handle_injection_attempt(path_param, referer_URI, referer_title):
  954.     '''
  955.     Decide if the injection attempt was due to innocently following
  956.     a malicious link or due to creating one.
  957.     '''
  958.     # Check if the URL can be sanitized.
  959.     if is_injection_attempt(path_param, '', ''):
  960.         destination = 'https://en.wikipedia.org/wiki/Data_validation'
  961.     else:
  962.         destination = my_url + '?path=' + path_param
  963.     redirect_spam(destination)
  964. def main():
  965.     '''
  966.     `compressout.init` MUST be called before `main`
  967.     and `compressout.done` after.
  968.     '''
  969.     # HTML vs XHTML
  970.     global html_page
  971.     html_page = 'Vary: Accept\n'
  972.     if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
  973.         html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
  974.     else:
  975.         html_page += 'Content-Type: text/html; charset=UTF-8\n'
  976.     # Check that the method is either GET, HEAD or OPTIONS.
  977.     if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
  978.         if os.getenv('REQUEST_METHOD') != 'OPTIONS':
  979.             compressout.write_h('Status: 405\n')
  980.         compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
  981.         compressout.write_h('Content-Type: text/plain\n')
  982.         compressout.write_h('\n')
  983.         if os.getenv('REQUEST_METHOD') != 'OPTIONS':
  984.             compressout.write_b('Method not allowed!\n')
  985.         compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
  986.         return
  987.     # Get the parameters.
  988.     params = cgi.FieldStorage()
  989.     path = path_param = params.getfirst('path', default='')
  990.     referer_URI = params.getfirst('referer', default='')
  991.     referer_title = params.getfirst('title', default='Back')
  992.     referer = (referer_URI, referer_title)
  993.     download_flag = params.getfirst('download', default='no')
  994.     sitemap_param = params.getfirst('sitemap', default='none')
  995.     
  996.     if not os.getenv('QUERY_STRING'):
  997.         index_page()
  998.         return
  999.         
  1000.     # Bad request, but will match the evil patterns.
  1001.     # Keep it before the evil stopper.
  1002.     if bool(path_param) and not path_param.startswith('/'):
  1003.         status400('`path` is not relative to this site. (No leading slash.)')
  1004.         return
  1005.     
  1006.     # Do not allow evil requests.
  1007.     allow = True
  1008.     # Keep things within the server root.
  1009.     try:
  1010.         path = os.path.realpath(root + path)
  1011.     except:
  1012.         allow = False
  1013.     if path != root and not path.startswith(root + '/'):
  1014.         allow = False
  1015.     # Stop at forbidden paths. #1/2
  1016.     for regex in conf['noaccess']:
  1017.         if re.match(regex, path[rootlen:]) is not None:
  1018.             allow = False
  1019.     
  1020.     # Prevent XSS, URL injection, spam injection and miscellaneous assholery.
  1021.     if is_injection_attempt(path_param, referer_URI, referer_title):
  1022.         allow = False
  1023.     if not allow:
  1024.         handle_injection_attempt(path_param, referer_URI, referer_title)
  1025.         return
  1026.     
  1027.     # Bad requests:
  1028.     if download_flag not in ('yes', 'no'):
  1029.         status400('`download` MUST be "yes", "no" or unset.')
  1030.         return
  1031.     if bool(path_param) and sitemap_param != 'none':
  1032.         status400('The `sitemap` parameter cannot be used with any other.')
  1033.         return
  1034.     if download_flag == 'yes' and bool(referer_URI):
  1035.         status400("`download=yes` can't be used with the `referer` parameter.")
  1036.         return
  1037.     if sitemap_param not in ('none', 'xml', 'html'):
  1038.         status400('`sitemap` MUST be "html", "xml" or unset.')
  1039.         return
  1040.     if download_flag == 'yes' and not bool(path_param):
  1041.         status400('Nothing to `download`. Use the `path` parameter.')
  1042.         return
  1043.     if bool(referer_URI) and not bool(path_param):
  1044.         status400('`referer` cannot be used without `path`')
  1045.         return
  1046.     if referer_title != 'Back' and not bool(referer_URI):
  1047.         status400('`referer` is not set.')
  1048.         return
  1049.     
  1050.     if allow:
  1051.     # Generate sitemap?
  1052.         if sitemap_param != 'none':
  1053.             sitemap(sitemap_param)
  1054.         else:
  1055.             # Stop at forbidden paths. #2/2
  1056.             for regex in conf['topsecret']:
  1057.                 if re.match(regex, path[rootlen:]) is not None:
  1058.                     status404()
  1059.                     break
  1060.             else:
  1061.                 # Allowed to be seen.
  1062.                 try:
  1063.                     os.listdir(path)
  1064.                     if download_flag == 'no':
  1065.                         if if_none_match(path):
  1066.                             ls(path, referer)
  1067.                     else:
  1068.                         status400("Can't download a directory.")
  1069.                 except OSError as e:
  1070.                     if e.errno == errno.ENOTDIR:
  1071.                         if download_flag == 'no':
  1072.                             if if_none_match(path):
  1073.                                 cat(path, referer)
  1074.                         else:
  1075.                             # `download` sets a few headers.
  1076.                             download(path)
  1077.                     elif e.errno == errno.ENOENT:
  1078.                         status404()
  1079.                     else:
  1080.                         raise ValueError(
  1081.                             'errno must be either ENOTDIR or ENOENT'
  1082.                         )
# Entry point.  `compressout` buffers (and possibly gzip-compresses) all
# output, so it must bracket the whole of `main` — see `main`'s docstring.
if __name__ == '__main__':
    compressout.init()
    main()
    compressout.done()