CGI source code reader script

This is (the source of) the script that generates this very page.

Through this, you can see the source code for all the scripts on my site.

Requirements for a file's /read/ page to be indexable:
  • Always indexable if whitelisted
  • Not manually blacklisted
  • Not made from another file
  • Text file
  • At least 3072/1536/1024 Unicode code points (full limit / half with a meta description and title / a third with an onpage description)
  • At least 300/150/100 "words"
  • At least 60/30/20 lines
  • At least 24/12/8 comments
Last modified
Lines 1125
Indexable Yes

Parent directory Download CGIread sitemap Main page

Quick links: cat code content description download forms handle_injection_attempt if_none_match index_page is_injection_attempt isword ls main mk_description mk_navigation mk_referer_param navigation noindex ol_content redirect_spam sitemap syntax title

#!/usr/bin/python
# -*- coding: utf-8 -*-
# CGI script that serves the source code of the site's scripts (/read/).
# Top-level configuration constants:
root = '/var/www'
owner = 'Oskar Skog'
my_url = '/read/'
canonical_url = 'https://__HOST__/read/'
# Static bodies for the error responses emitted by status403/404/503 below.
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'
import sys
sys.path.append(root)  # Make site-local modules (compressout, htmlescape, ...) importable.
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap  # Name conflict with already existing function.
import cgitb
cgitb.enable()  # Show tracebacks in the browser on uncaught exceptions.
rootlen = len(root)  # Used throughout to strip the filesystem prefix from paths.
#html_mime = 'text/html'      # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
# NOTE(review): `eval` executes arbitrary code from read.cfg; acceptable only
# because the config file is local and trusted.  `ast.literal_eval` would be
# safer — confirm the config is a pure Python literal before switching.
conf = eval(open('read.cfg').read())
  29. def redirect_spam(destination):
  30.     '''`destination` is the URL to which assholes should be redirected.'''
  31.     compressout.write_h('Status: 303\n')
  32.     compressout.write_h('Location: {}\n'.format(destination))
  33.     compressout.write_h('\n')
def status400(message):
    '''
    HTTP 400; `message` goes UNESCAPED inside a <pre> element.

    NOTE(review): because `message` is not escaped, callers must never pass
    raw user input here — confirm at the call sites.
    '''
    compressout.write_h('Status: 400\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    # __HTML5__, __NAVIGATION__ and __FOOTER__ are placeholders expanded
    # outside this function (template post-processing).
    compressout.write_b('''__HTML5__
        <title>400 - Bad Request</title>
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content">
            <h1 id="title">400 - Bad Request</h1>
            <pre>{}</pre>
            <p>
                Your request can't be understood.
                Check the parameters.
            </p>
            <p><a href="/read/">Documentation for the parameters</a></p>
        </div></main>
'''.format(message))
    compressout.write_b('''
        __FOOTER__
    </body>
</html>''')
  58. def status403():
  59.     '''HTTP 403'''
  60.     compressout.write_h(html_page)
  61.     compressout.write_h('Status: 403\n\n')
  62.     compressout.write_b(open(html403file).read())
  63. def status404():
  64.     '''HTTP 404'''
  65.     compressout.write_h('Status: 404\n')
  66.     compressout.write_h(html_page)
  67.     compressout.write_h('\n')
  68.     compressout.write_b(open(html404file).read())
  69. def status503():
  70.     '''
  71.     HTTP 503
  72.     
  73.     Call this if there is too much load on the server to do something.
  74.     (Used by the sitemap function.)
  75.     '''
  76.     compressout.write_h('Status: 503\n')
  77.     compressout.write_h(html_page)
  78.     # One factor is load avg for 1 minute, add some slop to the delay for bots.
  79.     compressout.write_h('Retry-After: 90\n')
  80.     compressout.write_h('\n')
  81.     compressout.write_b(open(html503file).read())
def index_page():
    '''
    Serve the static landing page (https://oskog97.com/read/).

    Handles conditional requests: emits 304 when the client's
    If-None-Match matches the computed ETag, and stops after headers
    for HEAD requests.
    '''
    # Handle 304s.
    # ETag varies with: output MIME ('x' when html_page advertises XHTML),
    # gzip acceptance ('z'), and this script's own mtime.
    ETag = '"{}{}{}"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat('index.py').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(html_page)
    if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
        compressout.write_h('Status: 304\n\n')
        return
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    # Write out a static page.
    # __HTML5__, __TITLE__, __NAVIGATION__, __H1__ and __FOOTER__ are
    # placeholders expanded outside this function.
    compressout.write_b('''__HTML5__
    <!-- With canonical link tag. -->
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <meta name="description" content="Interested in the scripts I have
        on my website? Come and take a look at them."/>
        __TITLE__
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content">
            __H1__
    ''')
    # {0} below is `my_url` (see the .format at the end of the literal).
    compressout.write_b('''
            <p>
                Interested in the scripts I have on my website?
                Go take a look at them; start crawling the
                <a href="{0}?path=/">root directory</a> or take a look
                at the <span class="a"><a href="{0}?sitemap=html"
                >(sub)sitemap</a>.</span>
            </p>
            <div id="syntax">
                <h2>Parameter syntax</h2>
                <p>
                    Descriptions for the parameters can be found in
                    the request forms.
                </p>
                <ul>
                    <li>
                        Asterisks <q>*</q> represent a value that can be
                        (almost) anything.
                    </li>
                    <li>Square brackets <q>[]</q> represent optional.</li>
                    <li>Curly brackets <q>&#x7b;&#x7d;</q> represent mandatory.</li>
                    <li>Pipes <q>|</q> represent either or.</li>
                </ul>
                <p>There are three acceptable "sets" of parameters:</p>
                <ol>
<li><pre>{0}?sitemap=&#x7b;html|xml&#x7d;</pre></li>
<li><pre>{0}?path=*[&amp;download=yes]</pre></li>
<li><pre>{0}?path=*[&amp;referer=*[&amp;title=*]]</pre></li>
                </ol>
                <p>
                    The order of the valid parameters doesn't matter, but
                    this is the recommended/canonical order.
                </p>
            </div>
            <div id="forms">
                <h2>Request forms</h2>
                <p><strong>
                    Notice that these are three different forms.
                </strong></p>
                <form action="{0}" method="get">
                <h3>Sitemap</h3>
                <p>
                    The <code>sitemap</code> parameter can be either
                    <q><code>html</code></q>, <q><code>xml</code></q>
                    or the default <q><code>none</code></q>.
                    It can't be used together with any other parameters.
                </p>
                <p>
                    <input type="radio" name="sitemap" value="html"/>
                    Request an HTML sitemap instead of a page<br/>
                    <input type="radio" name="sitemap" value="xml"/>
                    request an XML sitemap instead of a page<br/>
                    <input type="submit"/>
                </p>
                </form>
                <form action="{0}" method="get">
                <h3>Page</h3>
                <p>
                    A page (source code of a CGI script) is selected with the
                    <code>path</code> parameter.  The value of the
                    <code>path</code> parameter is a URL relative to this
                    site, ie. an URL beginning with a single slash.
                </p>
                <p>
                    The <code>path</code> is the site-local URL to the CGI
                    script or directory you're interested in.  If you set the
                    value to <q><code>/read/index.py</code></q>, you'll get the
                    source code for this script. And if you set it to
                    <q><code>/</code></q>, you'll get a directory listing
                    of the site's root directory.
                </p>
                <p>
                    Path/URL: <input type="text" name="path" value="/"/>
                    <input type="submit"/><br/>
                    <input type="checkbox" name="download" value="yes"/>
                    Download / see it as plain text
                    
                </p>
                <p>
                    The <code>download</code> parameter can be set to either
                    <q><code>yes</code></q> or the default
                    <q><code>no</code></q>.  The download option does
                    obviously not work with directories.
                </p>
                </form>
                <form action="{0}" method="get">
                <h3>Link back to a referencing page</h3>
                <p>
                    If <code>download</code> is <q><code>no</code></q> or
                    unset and a page (not a sitemap) was requested, it is
                    possible to change the navigation to make the requested
                    page link back to a referring page.
                </p>
                <p>
                    The <code>referer</code> (yes, misspelled like the HTTP
                    Referer) parameter is the URL of the referencing page.
                    (Don't try to specify a site that isn't mine.)
                    The <code>title</code> parameter gives the back link a
                    different text than <q>Back</q>.
                </p>
                <table>
                    <tr>
                        <th><code>path</code></th>
                        <td><input type="text" name="path" value="/"/></td>
                    </tr>
                    <tr>
                        <th><code>referer</code></th>
                        <td><input type="text" name="referer"/></td>
                    </tr>
                    <tr>
                        <th><code>title</code></th>
                        <td><input type="text" name="title"/></td>
                    </tr>
                    <tr>
                        <td></td>
                        <td><input type="submit"/></td>
                    </tr>
                </table>
                </form>
            </div>
        </div></main>
    '''.format(my_url))
    compressout.write_b('''
        __FOOTER__
    </body>
</html>
''')
  239. def noindex(path):
  240.     '''
  241.     Returns True if `path` should be noindexed.
  242.     
  243.     `path` is an absolute **filesystem** path.
  244.     '''
  245.     def isword(w):
  246.         letters = string.letters + ',.'
  247.         for ch in w:
  248.             if w not in letters:
  249.                 return False
  250.         return True
  251.     # 1. White list
  252.     # 2. Black list
  253.     # 3. Page quality (not applicable for directories)
  254.     
  255.     # Check whitelist first.
  256.     for regex in conf['doindex']:
  257.         if re.match(regex, path[rootlen:]) is not None:
  258.             return False
  259.             break
  260.     
  261.     # Blacklist (two kinds):
  262.     # - Generated from another file.
  263.     # - Explicitly blacklisted in 'read.cfg'.
  264.     for match, replace in conf['madefrom']:
  265.         if re.match(match, path[rootlen:]) is not None:
  266.             try:
  267.                 os.stat(root + re.sub(match, replace, path[rootlen:]))
  268.                 return True
  269.             except:
  270.                 pass
  271.     for regex in conf['noindex'] + conf['hide']:
  272.         if re.match(regex, path[rootlen:]) is not None:
  273.             return True
  274.     
  275.     # Quality:
  276.     #   - Text file
  277.     #   - At least 3072 Unicode code points
  278.     #   - At least 300 words
  279.     #   - At least 60 lines
  280.     #   - Half the limitations if a meta description and title is found
  281.     #   - A third of the limimitations if an onpage description is found
  282.     try:
  283.         os.listdir(path)
  284.         return False
  285.     except:
  286.         pass
  287.     # Normal file.
  288.     try:
  289.         text = open(path).read().decode('utf-8')
  290.     except:
  291.         return True
  292.     min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
  293.     quality = mk_description(path)[0] + 1
  294.     min_chars //= quality; min_words //= quality
  295.     min_lines //= quality; min_comments //= quality
  296.     if len(text) < min_chars:
  297.         return True
  298.     if text.count('\n') + 1 < min_lines:
  299.         return True
  300.     n_comments = 0
  301.     is_comment = re.compile('^(.*#.*| *\\* .*|.*<!--.*|.*\'\'\'.*)$')
  302.     for line in text.split('\n'):
  303.         if re.match(is_comment, line) is not None:
  304.             n_comments += 1
  305.     if n_comments < min_comments:
  306.         return True
  307.     if len(filter(isword, text.replace('\n', ' ').split(' '))) < min_words:
  308.         return True
  309.     # Passed the quality tests:
  310.     return False
def mk_navigation(referer, title):
    '''
    Returns a string which is the navigation bar's HTML.
    
    `title` is the title of the requested page.
    
    `referer` is used to **optionally** ``integrate`` a page.
    `referer` is a tuple of (URL, title) for the "back" link.
    '''
    # With a referer, build a real navigation bar linking back to the
    # referring page; otherwise return the __NAVIGATION__ placeholder,
    # which is expanded elsewhere.
    if referer[0]:
        # htmlescape.escape placeholders: URL is escape-mode 2, titles
        # mode 1, my_url mode 0 (inserted verbatim).
        return htmlescape.escape('''<!-- Navigation generated by CGIread. -->
<nav><div id="navigation"><div id="nav_inner">
<p><a href="#content" class="textonly">Skip navigation</a></p>
<p class="row">
<span class="textonly" translate="no">[</span><a class="head" href="{URL}">{title}</a><span class="textonly" translate="no">]</span>
&gt;&gt;
<span class="textonly" translate="no">]</span><span class="sub active">{me}</span><span class="textonly" translate="no">[</span>
<span class="textonly" translate="no">[</span><a class="sub" href="{my_url}?sitemap=html">Sitemap for website's scripts</a><span class="textonly" translate="no">]</span>
</p>
<p class="row">
<span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
&gt;&gt;
<span class="textonly" translate="no">[</span><a class="sub" href="/read/">Website's scripts</a><span class="textonly" translate="no">]</span>
<span class="textonly" translate="no">[</span><a class="sub" href="/pages/policy.html">Privacy policy &amp; terms of use</a><span class="textonly" translate="no">]</span>
<span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
</p>
<hr class="textonly"/>
</div></div></nav>
<!-- End of navigation. -->''',
            URL=(2, referer[0]),
            title=(1, referer[1]),
            me=(1, title),
            my_url=(0, my_url),
        )
    else:
        return '''__NAVIGATION__'''
  347. def mk_referer_param(referer):
  348.     '''Returns one of:
  349.         ''
  350.         '&referer=' + referer[0]
  351.         '&referer=' + referer[0] + '&title=' + referer[1]
  352.     to be added to links from the requested page.
  353.     
  354.     `referer` is used to **optionally** ``integrate`` a page.
  355.     See `mk_navigation`
  356.     '''
  357.     if referer[0]:
  358.         if referer[1] != 'Back':
  359.             title = '&title={}'.format(referer[1])
  360.         else:
  361.             title = ''
  362.         return '&referer={}{}'.format(referer[0], title)
  363.     else:
  364.         return ''
def mk_description(path):
    '''
    Return a 4-tuple: (good, title, meta_description, onpage_description)
    
    `path` is the absolute filesystem path to the requested page.
    
    `good` is
        0       no title and description
        1       title and meta description only
        2       also an onpage description
    
    `title` is the title of the page.
    
    `meta_description` is the content of the description meta tag.
    
    `onpage_description` is HTML content for the onpage description of the
    requested page.

    The data comes from an optional sidecar file `path + '.info'`:
    line 0 is the title, the lines up to a lone '.' are the meta
    description, and the lines after it are the onpage description.
    '''
    good = 0
    title = "source code of {}".format(path[rootlen:])
    meta_description = ''
    onpage_description = None
    # Best effort: a missing/unreadable .info file simply leaves good == 0.
    try:
        content = open(path + '.info').read().split('\n')
        good = 1
    except:
        pass
    if good:
        title = content[0]
        # A line containing only '.' separates meta from onpage description.
        try:
            sep = content.index('.')
        except ValueError:
            sep = None
        if sep is not None:
            good = 2
            meta_description = '\n'.join(content[1:sep])
            onpage_description = '\n'.join(content[sep+1:])
        else:
            meta_description = '\n'.join(content[1:])
    # Fall back to wrapping the (possibly empty) meta description in <p>.
    if onpage_description is None:
        onpage_description = htmlescape.escape('<p>{}</p>',1,meta_description)
    return good, title, meta_description, onpage_description
def sitemap(sitemap_type):
    '''
    Write out an XML or HTML sitemap.
    sitemap_type in ('xml', 'html')
    
    The XML sitemap will exclude entries from `conf['noxmlsitemap']`.

    Crawling the whole tree is expensive, so non-HEAD requests are
    throttled via the `read.throttlecontrol` timestamp file and the
    1-minute load average; over the limit, a 503 is sent instead.
    '''
    
    if os.getenv('REQUEST_METHOD') != 'HEAD': # NOTICE
        # Prevent over-revving the server.
        # HEAD requests are basically no-ops.
        maxload = conf['sitemap-maxload']
        if os.getloadavg()[0] > maxload['load-avg1']:
            status503()
            return
        # NOTE: relies on Python 2 `map` returning a list (indexed and
        # mutated below).
        access_times = map(
            float, open('read.throttlecontrol').read().strip().split(':')
        )
        # Too soon since the oldest remembered request -> throttle.
        if time.time() - access_times[-1] < maxload['throttle-time']:
            status503()
            return
        access_times.insert(0, time.time())
        access_times = access_times[:maxload['throttle-requests']]
        f = open('read.throttlecontrol', 'w')
        f.write(':'.join(map(str, access_times)) + '\n')
        f.close()
    # Write headers before doing anything else.
    # A HEAD request doesn't need to know the length (it's TE chunked).
    if sitemap_type == 'xml':
        compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
        compressout.write_h(
            'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
            '; rel="canonical"' +
            '; type="text/html"\n'
        )
        compressout.write_h('X-Robots-Tag: noindex\n\n') # NOTE: last.
    elif sitemap_type == 'html':
        compressout.write_h(html_page)
        compressout.write_h('\n')
    else:
        assert False, "Neither 'xml' nor 'html'"
    if os.getenv('REQUEST_METHOD') == 'HEAD': # NOTICE
        return
    
    # Find the pages worth being in the sitemap.
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    paths = []
    
    for basedir, dirs, files in os.walk(root, topdown=True):
        # Exclude hidden directories:
        remove_list = []
        for dirname in dirs:
            dirpath = os.path.join(basedir, dirname)[rootlen:]
            for regex in no_access:
                if re.match(regex, dirpath) is not None:
                    #dirs.remove(dirname)
                    # BUG: The for loop will skip items in the list if
                    # other items are removed while looping.
                    # This caused some real' nasty stuff like sshin to
                    # be crawled, took a whopping .65 seconds.
                    remove_list.append(dirname)
                    break
        # Pruning `dirs` in place makes topdown os.walk skip those trees.
        for dirname in remove_list:
            dirs.remove(dirname)
        
        # Iterate over files:
        for filename in files:
            filepath = os.path.join(basedir, filename)
            # No symlinks allowed.
            if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
                #try:
                    description = mk_description(filepath)
                    if description[0]:
                        # Only indexable content allowed.
                        if not noindex(filepath):
                            paths.append((filepath[rootlen:], description[3]))
                #except IOError as error:
                    #assert error.errno in (
                        #errno.EISDIR, errno.EACCES
                    #), error.errno
    
    paths.sort(key=lambda x: x[0])
    
    # Print the body.
    if sitemap_type == 'xml':
        compressout.write_b('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
''')
        #
        for path, description in paths:
            # Loop through all the regexes; for/else: the else branch runs
            # only when no regex matched (no break).
            for regex in conf['noxmlsitemap']:
                if re.match(regex, path) is not None:
                    break
            else:
                compressout.write_b(htmlescape.escape('''<url>
    <loc>{canonical_url}?path={path}</loc>
    <priority>0.5</priority>
''',
                    canonical_url=(0, canonical_url),
                    path=(1, path),
                ))
                mod_sitemap.lastmod_changefreq(
                    root + path,
                    compressout,
                )
                compressout.write_b('</url>\n')
        #
        compressout.write_b('</urlset>\n')
    elif sitemap_type == 'html':
        compressout.write_b('''__HTML5NC__
        <link rel="canonical" href="{canonical_url}?sitemap=html"/>
        <link rel="alternate" href="{canonical_url}?sitemap=xml"
            type="application/xml"/>
        <meta name="robots" content="noindex, follow"/>
        <title>Sitemap for scripts' source code</title>
        <meta name="description" content="
            Sitemap of all scripts available through /read/.
        "/>
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content" class="sitemap">
            <h1 id="title">Sitemap for scripts' source code</h1>
            <p><a href="{my_url}?path=/">Root directory</a></p>
            <dl>
'''.format(my_url=my_url, canonical_url=canonical_url))
        #
        indent = 16 * ' '
        for path, description in paths:
            compressout.write_b(indent + htmlescape.escape(
                '''<dt><a translate="no" href="{my_url}?path={path}">
                    {path}
                </a></dt>\n''',
                path=(0, path),
                my_url=(0, canonical_url),
            ))
            compressout.write_b(indent +
                htmlescape.escape('<dd>{}</dd>\n', 0, description)
            )
        #
        compressout.write_b('''            </dl>
        </div></main>
        __FOOTER__
    </body>
</html>
''')
    else:
        assert False, "Neither 'xml' nor 'html'"
def ls(path, referer):
    '''
    Write out a directory listing page for `path` (absolute filesystem
    path).  `referer` is the (URL, title) back-link tuple; see
    `mk_navigation` / `mk_referer_param`.
    '''
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    compressout.write_b('''__HTML5NC__''')
    # Page head and listing header; the root directory gets its
    # "parent directory" link commented out via isroot_commentout_*.
    compressout.write_b(htmlescape.escape('''
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <title>Index of {name}</title>
        <meta name="robots" content="{robots_follow}, noindex"/>
        <link rel="canonical" href="{canonical_url}?path={name}"/>
    </head>
    <body>
        {navigation}
        <main><div id="content" class="ls">
            <h1 id="title">Index of <span translate="no">{name}</span></h1>
            <p class="read-nav">
                {isroot_commentout_start}
                    <a href="{my_url}?path={parent_path}{referer_params}">
                        Parent directory
                    </a>
                {isroot_commentout_end}
                <a href="{my_url}?sitemap=html">CGIread sitemap</a>
                <a href="{my_url}">Main page</a>
            </p>
            <table id="ls">
            ''',
            name          =(1, path[rootlen:] + '/'),
            parent_path   =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
            robots_follow =(2, 'no'*noindex(path)+'follow'),
            navigation    =(0, mk_navigation(
                                referer,
                                "Index of "+path[rootlen:]+'/'
                            )),
            referer_params=(2, mk_referer_param(referer)),
            my_url=(0, my_url),
            canonical_url=(0, canonical_url),
            isroot_commentout_start=(0, '<!--'*(path == root)),
            isroot_commentout_end=(0, '-->'*(path == root)),
        ))
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    
    for x in sorted(os.listdir(path)):
        full_path = os.path.join(path, x)
        
        # Skip entries matching any forbidden-path regex.
        forbidden = False
        for regex in no_access:
            if re.match(regex, full_path[rootlen:]) is not None:
                forbidden = True
                break
        if forbidden:
            continue
        
        #url = cgi.escape(full_path, quote=True)
        # listdir succeeding is used as the "is a directory" test.
        try:
            os.listdir(full_path)
            is_dir = 1
        except:
            is_dir = 0
        # mobile_desc: short marker shown on small screens;
        # desktop_desc: fuller description for wide screens.
        if is_dir:
           mobile_desc = '<span class="yeah">-&gt;</span>'
           desktop_desc = '<span class="yeah">Directory</span>'
        else:
            content = open(full_path).read()
            # UTF-8 decodability decides text vs. binary.
            try:
                content.decode('UTF-8')
                binary = False
            except:
                binary = True
            if binary:
                desktop_desc = 'Binary'
                mobile_desc = ':-('
            else:
                good, title, meta_d, onpage_d = mk_description(full_path)
                if good == 2:
                    desktop_desc = htmlescape.escape(
                        '<span class="thenumberofthebeast">{}</span>',
                        1, meta_d
                    )
                    if noindex(full_path):
                        mobile_desc = '<span class="yeah">:-)</span>'
                    else:
                        mobile_desc = '<span class="thenumberofthebeast">:-D</span>'
                elif not noindex(full_path):
                    mobile_desc = '<span class="yeah">:-)</span>'
                    desktop_desc = '<span class="yeah">Text; indexable</span>'
                else:
                    mobile_desc = ':-|'
                    desktop_desc = 'Boring; unindexable'
                    
        compressout.write_b(
            htmlescape.escape(
                '''<tr><td class="mobile">{mobile_desc}</td>
                <td><a translate="no"
                    href="{site}?path={path}{referer}">{text}</a></td>
                <td class="desktop">{desktop_desc}</td></tr>
                ''',
                site=(0, my_url),
                path=(2, full_path[rootlen:] + '/'*is_dir),
                referer=(2, mk_referer_param(referer)),
                text=(1, x + '/'*is_dir),
                mobile_desc=(0, mobile_desc),
                desktop_desc=(0, desktop_desc),
            )
        )
    compressout.write_b('''            <!--</p>--></table>
        </div></main>
        __FOOTER__
    </body>
</html>\n''')
  670. def download(path):
  671.     if noindex(path):
  672.         compressout.write_h('X-Robots-Tag: noindex\n')
  673.     else:
  674.         compressout.write_h('X-Robots-Tag: index\n') # For verbosity.
  675.     content = open(path).read()
  676.     try:
  677.         content.decode('utf-8')
  678.         compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
  679.         compressout.write_h(htmlescape.escape(
  680.                 'Link: <{}?path={}>',
  681.                 0, canonical_url,
  682.                 2, path[rootlen:]
  683.             ) + '; rel="canonical"; type="text/html"\n'
  684.         )
  685.     except:
  686.         compressout.write_h(htmlescape.escape(
  687.             'Link: <{}?path={}>; rel="canonical"\n',
  688.             0, canonical_url,
  689.             2, path[rootlen:]
  690.         )) # No type specified.
  691.     if if_none_match(path):
  692.         compressout.write_h('\n')
  693.         if os.getenv('REQUEST_METHOD') != 'HEAD':
  694.             compressout.write_b(content)
  695. def cat(path, referer):
  696.     '''
  697.     '''
    def ol_content(text):
        '''
        Turn `text` into numbered <li><pre> lines plus quick links.

        Returns (out_lines, fragment_links): the joined <li> HTML for an
        ordered list, and the joined quick-link anchors for every
        fragment id found.
        '''
        out_lines = []
        ids = []    # Fragment ids already taken (first occurrence wins).
        allowed_chars = string.letters + '_-'
        for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment for this line.
            this_id = ''
            # Find ids in Python (def/class names) and XHTML (id="...").
            for decltype in ('def', 'class'):
                if line.strip().startswith(decltype + ' ') and '(' in line:
                    this_id = line.split(decltype, 1)[1].split('(')[0].strip()
            if 'id="' in line:
                this_id = line.split('id="')[1].split('"')[0]
            # Prevent bad ids (anything outside letters, '_' and '-').
            for ch in this_id:
                if ch not in allowed_chars:
                    this_id = ''
                    break
            # Duplicate ids are dropped so fragments stay unique.
            if this_id in ids:
                this_id = ''
            # Create the fragment identifier for the line.
            if this_id:
                ids.append(this_id)
                idline = 'id="content_{}"'.format(this_id)
            else:
                idline = ''
            # Create line: li id is the 1-based line number.
            out_lines.append(htmlescape.escape(
                    '    <li id="{}"><pre translate="no" {}>{}</pre></li>\n',
                    0, index + 1,
                    0, idline,
                    1, line,
            ))
        # Alphabetical quick links to all found fragments.
        fragment_links = []
        for fragment in sorted(ids):
            fragment_links.append(
                (
                    '<a class="quick" href="#content_{0}" translate="no"' +
                    '>{0}</a>\n'
                ).format(
                    fragment
                )
            )
        return ''.join(out_lines), ''.join(fragment_links)
  742.     
  743.     content = open(path).read()
  744.     try:
  745.         content.decode('utf-8')
  746.     except:
  747.         if noindex(path):
  748.             compressout.write_h('X-Robots-Tag: noindex\n')
  749.         else:
  750.             compressout.write_h('X-Robots-Tag: index\n')
  751.         compressout.write_h('\n')
  752.         compressout.write_b(content)
  753.         return
  754.     compressout.write_h(html_page)
  755.     compressout.write_h('\n')
  756.     if os.getenv('REQUEST_METHOD') == 'HEAD':
  757.         return
  758.     
  759.     ignore, title, meta_description, p_description = mk_description(path)
  760.     last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
  761.     
  762.     lines, fragment_links = ol_content(content)
  763.     if not fragment_links:
  764.         fragment_links = '(none)'
  765.     
  766.     compressout.write_b('''__HTML5NC__''')
  767.     compressout.write_b('''
  768. <script type="application/ld+json">
  769. {
  770.     "@context":
  771.     {
  772.         "@vocab": "http://schema.org/"
  773.     },
  774.     "@type": "SoftwareSourceCode",
  775.     "license": "https://opensource.org/licenses/BSD-2-Clause",
  776.     "author":
  777.     {
  778.     ''')
  779.     compressout.write_b('''
  780.         "@type": "Person",
  781.         "@id": "__SITE__/",
  782.         "name": "{0}",
  783.         "url": "__SITE__/"
  784.     '''.format(owner))
  785.     compressout.write_b('''
  786.     },
  787.     "publisher": {"@id": "__SITE__/"},
  788.     "copyrightHolder": {"@id": "__SITE__/"},
  789.     ''')
  790.     compressout.write_b('''
  791.     "url": "{}#code",
  792.     "DateModified": "{}"
  793.     '''.format(
  794.         canonical_url + '?path=' + path[rootlen:],
  795.         last_modified,
  796.     ))
  797.     compressout.write_b('''
  798. }
  799. </script>
  800.     ''')
  801.     parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
  802.     compressout.write_b(htmlescape.escape('''
  803.         <link rel="stylesheet" type="text/css" href="/read/style.css"/>
  804.         <title>{title}</title>
  805.         <link rel="canonical" href="{canonical}"/>
  806.         <link
  807.             rel="alternate"
  808.             href="{canonical}&amp;download=yes"
  809.             type="text/plain"
  810.         />
  811.         <meta name="robots" content="{noindex_no}index"/>
  812.         <meta name="description" content="{meta_description}"/>
  813.     </head>
  814.     <body>
  815.         {navigation}
  816. <main><div id="content">
  817.     <h1 id="title" translate="no">{title}</h1>
  818.     <div id="description">
  819.         {content_description}
  820.     </div>
  821.     <table>
  822.         <tr>
  823.             <td>Last modified</td>
  824.             <td><time datetime="{last_modified}">{last_modified}</time></td>
  825.         </tr>
  826.         <tr>
  827.             <td>Lines</td>
  828.             <td>{linecount}</td>
  829.         </tr>
  830.         <tr>
  831.             <td>Indexable</td>
  832.             <td>{indexable}</td>
  833.         </tr>
  834.     </table>
  835.     <p class="notprint read-nav">
  836.         <a href="{my_url}?path={parent_dir}">Parent directory</a>
  837.         <a href="{my_url}?path={path}&amp;download=yes" target="_blank">Download</a>
  838.         <a href="{my_url}?sitemap=html">CGIread sitemap</a>
  839.         <a href="{my_url}">Main page</a>
  840.     </p>
  841.     <p class="notprint">
  842.         Quick links:\n{fragments}
  843.     </p>
  844. <ol id="code">
  845. {content}
  846. </ol>
  847. </div></main>
  848. ''',
  849.         title=(2, title),
  850.         content=(0, lines),
  851.         parent_dir=(2, parent_link + mk_referer_param(referer)),
  852.         navigation=(0, mk_navigation(referer, path[rootlen:])),
  853.         canonical=(2, canonical_url + '?path=' + path[rootlen:]),
  854.         path=(2, path[rootlen:]),
  855.         noindex_no=(2, 'no' * noindex(path)),
  856.         meta_description=(2, meta_description),
  857.         content_description=(0, p_description),
  858.         last_modified=(2, last_modified),
  859.         linecount=(1, content.count('\n') + 1),
  860.         indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
  861.         fragments=(0, fragment_links),
  862.         my_url=(0, my_url),
  863.     ))
  864.     compressout.write_b('''
  865.         __FOOTER__
  866.     </body>
  867. </html>
  868. ''')
  869. def if_none_match(path):
  870.     '''
  871.     ETag handling for `cat`, `ls` and `download`:
  872.     
  873.     
  874.     Returns `True` if content needs to be generated.
  875.     Outputs necessary headers and 304 statuses.
  876.     '''
  877.     try:
  878.         meta_time = os.stat(path + '.info').st_mtime
  879.     except:
  880.         meta_time = 0
  881.     ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
  882.         'x'*('application/xhtml+xml' in html_page),
  883.         'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
  884.         os.stat(path).st_mtime,
  885.         meta_time,
  886.         base64.b64encode(os.getenv('QUERY_STRING', '')),
  887.         os.stat('index.py').st_mtime,
  888.         os.stat('read.cfg').st_mtime,
  889.     )
  890.     compressout.write_h('Vary: If-None-Match\n')
  891.     compressout.write_h('ETag: {}\n'.format(ETag))
  892.     compressout.write_h(
  893. '''X-ETag-Synopsis: [x][z]-<f_time>(<m_time>)-<query>-(<s_time>-<c_time>)
  894. X-ETag-Description-x: "Client accepts application/xhtml+xml"
  895. X-ETag-Description-z: "Content-Encoding: gzip"
  896. X-ETag-Description-f_time: "Unix last modified time for the requested file"
  897. X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
  898. X-ETag-Description-query: "base64 encoded $QUERY_STRING"
  899. X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
  900. X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
  901. ''')
  902.     if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
  903.         compressout.write_h('Status: 304\n\n')
  904.         return False
  905.     else:
  906.         return True
  907. def is_injection_attempt(path_param, referer_URI, referer_title):
  908.     '''
  909.     Various checks to see if any form of injection attempt has been
  910.     made.  This function checks the `path`, `referer` and `title`
  911.     parameters.
  912.     
  913.     Returns True if the request is an injection attempt.
  914.     
  915.     - XSS
  916.     - URL injection
  917.     - Spam injection
  918.     - Restricted files access
  919.     '''
  920.     # If the path parameter contains an XSS attempt, it can't be corrected
  921.     evil = False
  922.     # Prevent attacks.
  923.     if '..' in path_param:
  924.         return True
  925.     for var in referer_URI, referer_title:
  926.         for ch in var:
  927.             if ord(ch) < 32:
  928.                 return True
  929.             if ch in '<>&\'"':
  930.                 return True
  931.             # NOTICE: The following will limit parameters to ASCII.
  932.             if ord(ch) > 126:
  933.                 return True
  934.     # Prevent linking to Mallory.
  935.     for start in ('http://', 'https://', '//', 'ftp://'):
  936.         if referer_URI.startswith(start):
  937.             hostname = referer_URI.split('//')[1].split('/')[0]
  938.             if hostname not in conf['allowed-referer-hosts']:
  939.                 return True
  940.             else:
  941.                 break
  942.     else:
  943.         if ':' in referer_URI:
  944.             return True
  945.     # Prevent injected spam
  946.     if spammy.spammy(referer_title) or len(referer_title) > 42:
  947.         return True
  948.     # No match.
  949.     return False
  950. def handle_injection_attempt(path_param, referer_URI, referer_title):
  951.     '''
  952.     Decide if the injection attempt was due to innocently following
  953.     a malicious link or due to creating one.
  954.     '''
  955.     # Check if the URL can be sanitized.
  956.     if is_injection_attempt(path_param, '', ''):
  957.         destination = 'https://en.wikipedia.org/wiki/Data_validation'
  958.     else:
  959.         destination = my_url + '?path=' + path_param
  960.     redirect_spam(destination)
  961. def main():
  962.     '''
  963.     `compressout.init` MUST be called before `main`
  964.     and `compressout.done` after.
  965.     '''
  966.     # HTML vs XHTML
  967.     global html_page
  968.     html_page = 'Vary: Accept\n'
  969.     if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
  970.         html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
  971.     else:
  972.         html_page += 'Content-Type: text/html; charset=UTF-8\n'
  973.     # Check that the method is either GET, HEAD or OPTIONS.
  974.     if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
  975.         if os.getenv('REQUEST_METHOD') != 'OPTIONS':
  976.             compressout.write_h('Status: 405\n')
  977.         compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
  978.         compressout.write_h('Content-Type: text/plain\n')
  979.         compressout.write_h('\n')
  980.         if os.getenv('REQUEST_METHOD') != 'OPTIONS':
  981.             compressout.write_b('Method not allowed!\n')
  982.         compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
  983.         return
  984.     # Get the parameters.
  985.     params = cgi.FieldStorage()
  986.     path = path_param = params.getfirst('path', default='')
  987.     referer_URI = params.getfirst('referer', default='')
  988.     referer_title = params.getfirst('title', default='Back')
  989.     referer = (referer_URI, referer_title)
  990.     download_flag = params.getfirst('download', default='no')
  991.     sitemap_param = params.getfirst('sitemap', default='none')
  992.     
  993.     if not os.getenv('QUERY_STRING'):
  994.         index_page()
  995.         return
  996.         
  997.     # Bad request, but will match the evil patterns.
  998.     # Keep it before the evil stopper.
  999.     if bool(path_param) and not path_param.startswith('/'):
  1000.         status400('`path` is not relative to this site. (No leading slash.)')
  1001.         return
  1002.     
  1003.     # Do not allow evil requests.
  1004.     allow = True
  1005.     # Keep things within the server root.
  1006.     try:
  1007.         path = os.path.realpath(root + path)
  1008.     except:
  1009.         allow = False
  1010.     if path != root and not path.startswith(root + '/'):
  1011.         allow = False
  1012.     # Stop at forbidden paths. #1/2
  1013.     for regex in conf['noaccess']:
  1014.         if re.match(regex, path[rootlen:]) is not None:
  1015.             allow = False
  1016.     
  1017.     # Prevent XSS, URL injection, spam injection and miscellaneous assholery.
  1018.     if is_injection_attempt(path_param, referer_URI, referer_title):
  1019.         allow = False
  1020.     if not allow:
  1021.         handle_injection_attempt(path_param, referer_URI, referer_title)
  1022.         return
  1023.     
  1024.     # Bad requests:
  1025.     if download_flag not in ('yes', 'no'):
  1026.         status400('`download` MUST be "yes", "no" or unset.')
  1027.         return
  1028.     if bool(path_param) and sitemap_param != 'none':
  1029.         status400('The `sitemap` parameter cannot be used with any other.')
  1030.         return
  1031.     if download_flag == 'yes' and bool(referer_URI):
  1032.         status400("`download=yes` can't be used with the `referer` parameter.")
  1033.         return
  1034.     if sitemap_param not in ('none', 'xml', 'html'):
  1035.         status400('`sitemap` MUST be "html", "xml" or unset.')
  1036.         return
  1037.     if download_flag == 'yes' and not bool(path_param):
  1038.         status400('Nothing to `download`. Use the `path` parameter.')
  1039.         return
  1040.     if bool(referer_URI) and not bool(path_param):
  1041.         status400('`referer` cannot be used without `path`')
  1042.         return
  1043.     if referer_title != 'Back' and not bool(referer_URI):
  1044.         status400('`referer` is not set.')
  1045.         return
  1046.     
  1047.     if allow:
  1048.     # Generate sitemap?
  1049.         if sitemap_param != 'none':
  1050.             sitemap(sitemap_param)
  1051.         else:
  1052.             # Stop at forbidden paths. #2/2
  1053.             for regex in conf['topsecret']:
  1054.                 if re.match(regex, path[rootlen:]) is not None:
  1055.                     status404()
  1056.                     break
  1057.             else:
  1058.                 # Allowed to be seen.
  1059.                 try:
  1060.                     os.listdir(path)
  1061.                     if download_flag == 'no':
  1062.                         if if_none_match(path):
  1063.                             ls(path, referer)
  1064.                     else:
  1065.                         status400("Can't download a directory.")
  1066.                 except OSError as e:
  1067.                     if e.errno == errno.ENOTDIR:
  1068.                         if download_flag == 'no':
  1069.                             if if_none_match(path):
  1070.                                 cat(path, referer)
  1071.                         else:
  1072.                             # `download` sets a few headers.
  1073.                             download(path)
  1074.                     elif e.errno == errno.ENOENT:
  1075.                         status404()
  1076.                     else:
  1077.                         raise ValueError(
  1078.                             'errno must be either ENOTDIR or ENOENT'
  1079.                         )
if __name__ == '__main__':
    # `init` sets up the output buffering that `write_h`/`write_b`
    # rely on, and `done` flushes it to the client, so `main` MUST be
    # bracketed by this exact pair (see `main`'s docstring).
    compressout.init()
    main()
    compressout.done()