CGI source code reader script

This is (the source of) the script that generates this very page.

Through this, you can see the source code for all the scripts on my site.

    Requirements for a file for its /read/ page to be indexable:
  • Always indexable if whitelisted
  • Not manually blacklisted
  • Not made from another file
  • Text file
  • At least 3072/1536/1024 Unicode code points
  • At least 300/150/100 "words"
  • At least 60/30/20 lines
  • At least 24/12/8 comments
    (The three figures are the full / half / one-third thresholds: the limits
    are halved if the file has a title and meta description, and cut to a
    third if it also has an on-page description.)
Last modified: (not shown in this capture)
Lines: 1148
Indexable: Yes

Parent directory Download CGIread sitemap Main page

Quick links: cat code content description download forms handle_injection_attempt if_none_match index_page is_injection_attempt isword ls main mk_description mk_navigation mk_referer_param navigation noindex ol_content redirect_spam sitemap syntax title

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# CGIread: serves the source code of this site's own scripts under /read/.

# Site-wide configuration constants.
root = '/var/www'                            # Filesystem root of the website.
owner = 'Oskar Skog'
my_url = '/read/'                            # Site-local URL of this script.
canonical_url = 'https://__HOST__/read/'     # Absolute canonical URL.
# Static error documents served verbatim for the matching HTTP statuses.
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'
import sys
sys.path.append(root)                        # Make site-local modules importable.
import cgi
import os
import errno
import compressout
import base64
import re
import time
import htmlescape
import string
import spammy
import sitemap as mod_sitemap  # Name conflict with already existing function.
import cgitb
cgitb.enable()                               # Render tracebacks in the browser.
rootlen = len(root)                          # Used to strip `root` off filesystem paths.
#html_mime = 'text/html'      # Set to XHTML later.
html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
# NOTE(security): the config file is eval()'d, so 'read.cfg' must only ever
# be writable by trusted users -- anything in it executes as this script.
conf = eval(open('read.cfg').read())
  29. def redirect_spam(destination):
  30.     '''`destination` is the URL to which assholes should be redirected.'''
  31.     compressout.write_h('Status: 303\n')
  32.     compressout.write_h('Location: {}\n'.format(destination))
  33.     compressout.write_h('\n')
def status400(message):
    '''HTTP 400; `message` goes UNESCAPED inside a <pre> element.'''
    # Header block: status line, content type, blank line terminator.
    compressout.write_h('Status: 400\n')
    compressout.write_h(html_page)
    compressout.write_h('\n')
    # `message` is interpolated verbatim -- callers must pass trusted or
    # pre-escaped text only.
    compressout.write_b('''__HTML5__
        <title>400 - Bad Request</title>
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content">
            <h1 id="title">400 - Bad Request</h1>
            <pre>{}</pre>
            <p>
                Your request can't be understood.
                Check the parameters.
            </p>
            <p><a href="/read/">Documentation for the parameters</a></p>
        </div></main>
'''.format(message))
    compressout.write_b('''
        __FOOTER__
    </body>
</html>''')
  58. def status403():
  59.     '''HTTP 403'''
  60.     compressout.write_h(html_page)
  61.     compressout.write_h('Status: 403\n\n')
  62.     compressout.write_b(open(html403file).read())
  63. def status404():
  64.     '''HTTP 404'''
  65.     compressout.write_h('Status: 404\n')
  66.     compressout.write_h(html_page)
  67.     compressout.write_h('\n')
  68.     compressout.write_b(open(html404file).read())
  69. def status503():
  70.     '''
  71.     HTTP 503
  72.     
  73.     Call this if there is too much load on the server to do something.
  74.     (Used by the sitemap function.)
  75.     '''
  76.     compressout.write_h('Status: 503\n')
  77.     compressout.write_h(html_page)
  78.     # One factor is load avg for 1 minute, add some slop to the delay for bots.
  79.     compressout.write_h('Retry-After: 90\n')
  80.     compressout.write_h('\n')
  81.     compressout.write_b(open(html503file).read())
def index_page():
    '''Write the static index/documentation page for https://oskog97.com/read/.

    Handles conditional requests (ETag / If-None-Match -> 304) and
    HEAD requests (headers only).
    '''
    # Handle 304s.
    # The ETag varies with: content type negotiation ('x' when the page is
    # served as XHTML), compression ('z' when the client accepts gzip), and
    # this script's own modification time.
    ETag = '"{}{}{}"'.format(
        'x'*('application/xhtml+xml' in html_page),
        'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
        os.stat('index.py').st_mtime,
    )
    compressout.write_h('Vary: If-None-Match\n')
    compressout.write_h('ETag: {}\n'.format(ETag))
    compressout.write_h(html_page)
    if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
        compressout.write_h('Status: 304\n\n')
        return
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    # Write out a static page.
    compressout.write_b('''__HTML5__
    <!-- With canonical link tag. -->
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <meta name="description" content="Interested in the scripts I have
        on my website? Come and take a look at them."/>
        __TITLE__
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content">
            __H1__
    ''')
    # Body of the page: intro, parameter syntax reference, and the three
    # request forms.  {0} is this script's URL (`my_url`).
    compressout.write_b('''
            <p>
                Interested in the scripts I have on my website?
                Go take a look at them; start crawling the
                <a href="{0}?path=/">root directory</a> or take a look
                at the <span class="a"><a href="{0}?sitemap=html"
                >(sub)sitemap</a>.</span>
            </p>
            <div id="syntax">
                <h2>Parameter syntax</h2>
                <p>
                    Descriptions for the parameters can be found in
                    the request forms.
                </p>
                <ul>
                    <li>
                        Asterisks <q>*</q> represent a value that can be
                        (almost) anything.
                    </li>
                    <li>Square brackets <q>[]</q> represent optional.</li>
                    <li>Curly brackets <q>&#x7b;&#x7d;</q> represent mandatory.</li>
                    <li>Pipes <q>|</q> represent either or.</li>
                </ul>
                <p>There are three acceptable "sets" of parameters:</p>
                <ol>
<li><pre>{0}?sitemap=&#x7b;html|xml&#x7d;</pre></li>
<li><pre>{0}?path=*[&amp;download=yes]</pre></li>
<li><pre>{0}?path=*[&amp;referer=*[&amp;title=*]]</pre></li>
                </ol>
                <p>
                    The order of the valid parameters doesn't matter, but
                    this is the recommended/canonical order.
                </p>
            </div>
            <div id="forms">
                <h2>Request forms</h2>
                <p><strong>
                    Notice that these are three different forms.
                </strong></p>
                <form action="{0}" method="get">
                <h3>Sitemap</h3>
                <p>
                    The <code>sitemap</code> parameter can be either
                    <q><code>html</code></q>, <q><code>xml</code></q>
                    or the default <q><code>none</code></q>.
                    It can't be used together with any other parameters.
                </p>
                <p>
                    <input type="radio" name="sitemap" value="html"/>
                    Request an HTML sitemap instead of a page<br/>
                    <input type="radio" name="sitemap" value="xml"/>
                    request an XML sitemap instead of a page<br/>
                    <input type="submit"/>
                </p>
                </form>
                <form action="{0}" method="get">
                <h3>Page</h3>
                <p>
                    A page (source code of a CGI script) is selected with the
                    <code>path</code> parameter.  The value of the
                    <code>path</code> parameter is a URL relative to this
                    site, ie. an URL beginning with a single slash.
                </p>
                <p>
                    The <code>path</code> is the site-local URL to the CGI
                    script or directory you're interested in.  If you set the
                    value to <q><code>/read/index.py</code></q>, you'll get the
                    source code for this script. And if you set it to
                    <q><code>/</code></q>, you'll get a directory listing
                    of the site's root directory.
                </p>
                <p>
                    Path/URL: <input type="text" name="path" value="/"/>
                    <input type="submit"/><br/>
                    <input type="checkbox" name="download" value="yes"/>
                    Download / see it as plain text
                    
                </p>
                <p>
                    The <code>download</code> parameter can be set to either
                    <q><code>yes</code></q> or the default
                    <q><code>no</code></q>.  The download option does
                    obviously not work with directories.
                </p>
                </form>
                <form action="{0}" method="get">
                <h3>Link back to a referencing page</h3>
                <p>
                    If <code>download</code> is <q><code>no</code></q> or
                    unset and a page (not a sitemap) was requested, it is
                    possible to change the navigation to make the requested
                    page link back to a referring page.
                </p>
                <p>
                    The <code>referer</code> (yes, misspelled like the HTTP
                    Referer) parameter is the URL of the referencing page.
                    (Don't try to specify a site that isn't mine.)
                    The <code>title</code> parameter gives the back link a
                    different text than <q>Back</q>.
                </p>
                <table>
                    <tr>
                        <th><code>path</code></th>
                        <td><input type="text" name="path" value="/"/></td>
                    </tr>
                    <tr>
                        <th><code>referer</code></th>
                        <td><input type="text" name="referer"/></td>
                    </tr>
                    <tr>
                        <th><code>title</code></th>
                        <td><input type="text" name="title"/></td>
                    </tr>
                    <tr>
                        <td></td>
                        <td><input type="submit"/></td>
                    </tr>
                </table>
                </form>
            </div>
        </div></main>
    '''.format(my_url))
    compressout.write_b('''
        __FOOTER__
    </body>
</html>
''')
  239. def noindex(path):
  240.     '''
  241.     Returns True if `path` should be noindexed.
  242.     
  243.     `path` is an absolute **filesystem** path.
  244.     '''
  245.     def isword(w):
  246.         letters = string.ascii_letters + ',.'
  247.         for ch in w:
  248.             if w not in letters:
  249.                 return False
  250.         return True
  251.     # 1. White list
  252.     # 2. Black list
  253.     # 3. Page quality (not applicable for directories)
  254.     
  255.     # Check whitelist first.
  256.     for regex in conf['doindex']:
  257.         if re.match(regex, path[rootlen:]) is not None:
  258.             return False
  259.             break
  260.     
  261.     # Blacklist (two kinds):
  262.     # - Generated from another file.
  263.     # - Explicitly blacklisted in 'read.cfg'.
  264.     for match, replace in conf['madefrom']:
  265.         if re.match(match, path[rootlen:]) is not None:
  266.             try:
  267.                 os.stat(root + re.sub(match, replace, path[rootlen:]))
  268.                 return True
  269.             except:
  270.                 pass
  271.     for regex in conf['noindex'] + conf['hide']:
  272.         if re.match(regex, path[rootlen:]) is not None:
  273.             return True
  274.     
  275.     # Quality:
  276.     #   - Text file
  277.     #   - At least 3072 Unicode code points
  278.     #   - At least 300 words
  279.     #   - At least 60 lines
  280.     #   - Half the limitations if a meta description and title is found
  281.     #   - A third of the limimitations if an onpage description is found
  282.     try:
  283.         os.listdir(path)
  284.         return False
  285.     except:
  286.         pass
  287.     # Normal file.
  288.     try:
  289.         if sys.version_info[0] > 2:
  290.             text = open(path).read()
  291.         else:
  292.             text = open(path).read().decode('utf-8')
  293.     except:
  294.         return True
  295.     min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
  296.     quality = mk_description(path)[0] + 1
  297.     min_chars //= quality; min_words //= quality
  298.     min_lines //= quality; min_comments //= quality
  299.     if len(text) < min_chars:
  300.         return True
  301.     if text.count('\n') + 1 < min_lines:
  302.         return True
  303.     n_comments = 0
  304.     is_comment = re.compile('^(.*#.*| *\\* .*|.*<!--.*|.*\'\'\'.*)$')
  305.     for line in text.split('\n'):
  306.         if re.match(is_comment, line) is not None:
  307.             n_comments += 1
  308.     if n_comments < min_comments:
  309.         return True
  310.     if len(list(filter(isword, text.replace('\n', ' ').split(' ')))) < min_words:
  311.         return True
  312.     # Passed the quality tests:
  313.     return False
def mk_navigation(referer, title):
    '''
    Returns a string which is the navigation bar's HTML.
    
    `title` is the title of the requested page.
    
    `referer` is used to **optionally** ``integrate`` a page.
    `referer` is a tuple of (URL, title) for the "back" link.
    '''
    # With a referer URL, build a custom navigation bar whose top row links
    # back to the referring page; otherwise emit the site-wide
    # __NAVIGATION__ template placeholder.
    if referer[0]:
        # NOTE(review): the numeric escape levels (0/1/2) are defined by the
        # project's htmlescape module -- presumably 0 = verbatim, 1 = text,
        # 2 = attribute escaping; confirm against htmlescape.
        return htmlescape.escape('''<!-- Navigation generated by CGIread. -->
<nav><div id="navigation"><div id="nav_inner">
<p><a href="#content" class="textonly">Skip navigation</a></p>
<p class="row">
<span class="textonly" translate="no">[</span><a class="head" href="{URL}">{title}</a><span class="textonly" translate="no">]</span>
&gt;&gt;
<span class="textonly" translate="no">]</span><span class="sub active">{me}</span><span class="textonly" translate="no">[</span>
<span class="textonly" translate="no">[</span><a class="sub" href="{my_url}?sitemap=html">Sitemap for website's scripts</a><span class="textonly" translate="no">]</span>
</p>
<p class="row">
<span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
&gt;&gt;
<span class="textonly" translate="no">[</span><a class="sub" href="/read/">Website's scripts</a><span class="textonly" translate="no">]</span>
<span class="textonly" translate="no">[</span><a class="sub" href="/pages/policy.html">Privacy policy &amp; terms of use</a><span class="textonly" translate="no">]</span>
<span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
</p>
<hr class="textonly"/>
</div></div></nav>
<!-- End of navigation. -->''',
            URL=(2, referer[0]),
            title=(1, referer[1]),
            me=(1, title),
            my_url=(0, my_url),
        )
    else:
        return '''__NAVIGATION__'''
  350. def mk_referer_param(referer):
  351.     '''Returns one of:
  352.         ''
  353.         '&referer=' + referer[0]
  354.         '&referer=' + referer[0] + '&title=' + referer[1]
  355.     to be added to links from the requested page.
  356.     
  357.     `referer` is used to **optionally** ``integrate`` a page.
  358.     See `mk_navigation`
  359.     '''
  360.     if referer[0]:
  361.         if referer[1] != 'Back':
  362.             title = '&title={}'.format(referer[1])
  363.         else:
  364.             title = ''
  365.         return '&referer={}{}'.format(referer[0], title)
  366.     else:
  367.         return ''
def mk_description(path):
    '''
    Return a 4-tuple: (good, title, meta_description, onpage_description)
    
    `path` is the absolute filesystem path to the requested page.
    
    `good` is
        0       no title and description
        1       title and meta description only
        2       also an onpage description
    
    `title` is the title of the page.
    
    `meta_description` is the content of the description meta tag.
    
    `onpage_description` is HTML content for the onpage description of the
    requested page.
    '''
    # Defaults used when no `<path>.info` metadata file exists.
    good = 0
    title = "source code of {}".format(path[rootlen:])
    meta_description = ''
    onpage_description = None
    try:
        # `.info` file layout: line 1 = title; following lines = meta
        # description; an optional line consisting of a single '.'
        # separates the meta description from on-page HTML.
        content = open(path + '.info').read().split('\n')
        good = 1
    except:
        pass
    if good:
        title = content[0]
        try:
            sep = content.index('.')
        except ValueError:
            sep = None
        if sep is not None:
            good = 2
            meta_description = '\n'.join(content[1:sep])
            onpage_description = '\n'.join(content[sep+1:])
        else:
            meta_description = '\n'.join(content[1:])
    if onpage_description is None:
        # Fall back to wrapping the (escaped) meta description in a <p>.
        onpage_description = htmlescape.escape('<p>{}</p>',1,meta_description)
    return good, title, meta_description, onpage_description
def sitemap(sitemap_type):
    '''
    Write out an XML or HTML sitemap.
    sitemap_type in ('xml', 'html')
    
    The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
    '''    
    
    if os.getenv('REQUEST_METHOD') != 'HEAD': # NOTICE
        # Prevent over-revving the server.
        # HEAD requests are basically no-ops.
        maxload = conf['sitemap-maxload']
        if os.getloadavg()[0] > maxload['load-avg1']:
            status503()
            return
        # 'read.throttlecontrol' holds a colon-separated list of the most
        # recent request timestamps, newest first.
        try:
            access_times = list(map(
                float, open('read.throttlecontrol').read().strip().split(':')
            ))
        except:
            access_times = [0]
        # Deny if the oldest remembered request is still too recent.
        if time.time() - access_times[-1] < maxload['throttle-time']:
            status503()
            return
        access_times.insert(0, time.time())
        access_times = access_times[:maxload['throttle-requests']]
        f = open('read.throttlecontrol', 'w')
        f.write(':'.join(list(map(str, access_times))) + '\n')
        f.close()
    # Write headers before doing anything else.
    # A HEAD request doesn't need to know the length (it's TE chunked).
    if sitemap_type == 'xml':
        compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
        compressout.write_h(
            'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
            '; rel="canonical"' +
            '; type="text/html"\n'
        )
        compressout.write_h('X-Robots-Tag: noindex\n\n') # NOTE: last.
    elif sitemap_type == 'html':
        compressout.write_h(html_page)
        compressout.write_h('\n')
    else:
        assert False, "Neither 'xml' nor 'html'"
    if os.getenv('REQUEST_METHOD') == 'HEAD': # NOTICE
        return
    
    # Find the pages worth being in the sitemap.
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    paths = []
    
    for basedir, dirs, files in os.walk(root, topdown=True):
        # Exclude hidden directories:
        # (topdown=True lets us prune `dirs` in place so os.walk never
        # descends into them.)
        remove_list = []
        sys.stderr.write('In {}\n'.format(basedir))
        sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
        for dirname in dirs:
            dirpath = os.path.join(basedir, dirname)[rootlen:]
            for regex in no_access:
                if re.match(regex, dirpath) is not None:
                    #dirs.remove(dirname)
                    # BUG: The for loop will skip items in the list if
                    # other items are removed while looping.
                    # This caused some real' nasty stuff like sshin to
                    # be crawled, took a whopping .65 seconds.
                    remove_list.append(dirname)
                    break
        sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
        for dirname in remove_list:
            dirs.remove(dirname)
        
        # Iterate over files:
        for filename in files:
            filepath = os.path.join(basedir, filename)
            # No symlinks allowed.
            #if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
            if not os.path.islink(filepath):
                #try:
                    # Only described, indexable pages make the sitemap.
                    description = mk_description(filepath)
                    if description[0]:
                        # Only indexable content allowed.
                        if not noindex(filepath):
                            paths.append((filepath[rootlen:], description[3]))
                        else:
                            sys.stderr.write('{} is noindexed\n'.format(filepath))
                    else:
                        sys.stderr.write('{} has no description\n'.format(filepath))
                #except IOError as error:
                    #assert error.errno in (
                        #errno.EISDIR, errno.EACCES
                    #), error.errno
            else:
                sys.stderr.write('{} is link\n'.format(filepath))
    
    paths.sort(key=lambda x: x[0])
    
    # Print the body.
    if sitemap_type == 'xml':
        compressout.write_b('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
''')
        #
        for path, description in paths:
            # Loop through all the regexes:
            # (for/else: the else-branch runs only when no noxmlsitemap
            # regex matched, i.e. the entry is allowed in the XML sitemap.)
            for regex in conf['noxmlsitemap']:
                if re.match(regex, path) is not None:
                    break
            else:
                compressout.write_b(htmlescape.escape('''<url>
    <loc>{canonical_url}?path={path}</loc>
    <priority>0.5</priority>
''',
                    canonical_url=(0, canonical_url),
                    path=(1, path),
                ))
                mod_sitemap.lastmod_changefreq(
                    root + path,
                    compressout,
                )
                compressout.write_b('</url>\n')
        #
        compressout.write_b('</urlset>\n')
    elif sitemap_type == 'html':
        compressout.write_b('''__HTML5NC__
        <link rel="canonical" href="{canonical_url}?sitemap=html"/>
        <link rel="alternate" href="{canonical_url}?sitemap=xml"
            type="application/xml"/>
        <meta name="robots" content="noindex, follow"/>
        <title>Sitemap for scripts' source code</title>
        <meta name="description" content="
            Sitemap of all scripts available through /read/.
        "/>
    </head>
    <body>
        __NAVIGATION__
        <main><div id="content" class="sitemap">
            <h1 id="title">Sitemap for scripts' source code</h1>
            <p><a href="{my_url}?path=/">Root directory</a></p>
            <dl>
'''.format(my_url=my_url, canonical_url=canonical_url))
        #
        indent = 16 * ' '
        for path, description in paths:
            compressout.write_b(indent + htmlescape.escape(
                '''<dt><a translate="no" href="{my_url}?path={path}">
                    {path}
                </a></dt>\n''',
                path=(0, path),
                my_url=(0, canonical_url),
            ))
            compressout.write_b(indent +
                htmlescape.escape('<dd>{}</dd>\n', 0, description)
            )
        #
        compressout.write_b('''            </dl>
        </div></main>
        __FOOTER__
    </body>
</html>
''')
    else:
        assert False, "Neither 'xml' nor 'html'"
def ls(path, referer):
    '''
    Write an HTML directory listing for the filesystem directory `path`.

    `referer` is the (URL, title) back-link pair passed through to
    `mk_navigation` and `mk_referer_param`.  Entries matching the
    noaccess/hide/topsecret config regexes are omitted.
    '''
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    compressout.write_b('''__HTML5NC__''')
    compressout.write_b(htmlescape.escape('''
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <title>Index of {name}</title>
        <meta name="robots" content="{robots_follow}, noindex"/>
        <link rel="canonical" href="{canonical_url}?path={name}"/>
    </head>
    <body>
        {navigation}
        <main><div id="content" class="ls">
            <h1 id="title">Index of <span translate="no">{name}</span></h1>
            <p class="read-nav">
                {isroot_commentout_start}
                    <a href="{my_url}?path={parent_path}{referer_params}">
                        Parent directory
                    </a>
                {isroot_commentout_end}
                <a href="{my_url}?sitemap=html">CGIread sitemap</a>
                <a href="{my_url}">Main page</a>
            </p>
            <table id="ls">
            ''',
            name          =(1, path[rootlen:] + '/'),
            parent_path   =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
            robots_follow =(2, 'no'*noindex(path)+'follow'),
            navigation    =(0, mk_navigation(
                                referer,
                                "Index of "+path[rootlen:]+'/'
                            )),
            referer_params=(2, mk_referer_param(referer)),
            my_url=(0, my_url),
            canonical_url=(0, canonical_url),
            # The "parent directory" link is commented out when listing the
            # site root (there is no parent to show).
            isroot_commentout_start=(0, '<!--'*(path == root)),
            isroot_commentout_end=(0, '-->'*(path == root)),
        ))
    no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
    
    for x in sorted(os.listdir(path)):
        full_path = os.path.join(path, x)
        
        # Skip entries the config forbids access to.
        forbidden = False
        for regex in no_access:
            if re.match(regex, full_path[rootlen:]) is not None:
                forbidden = True
                break
        if forbidden:
            continue
        
        #url = cgi.escape(full_path, quote=True)
        # Directory check: listdir succeeds only for directories.
        try:
            os.listdir(full_path)
            is_dir = 1
        except:
            is_dir = 0
        # mobile_desc
        # desktop_desc
        if is_dir:
           mobile_desc = '<span class="yeah">-&gt;</span>'
           desktop_desc = '<span class="yeah">Directory</span>'
        else:
            # Classify the file: binary vs text, then description quality.
            try:
                content = open(full_path).read()        # This fails on Python 3 !!!
                if sys.version_info[0] == 2:
                    content.decode('UTF-8')
                binary = False
            except:
                binary = True
            if binary:
                desktop_desc = 'Binary'
                mobile_desc = ':-('
            else:
                good, title, meta_d, onpage_d = mk_description(full_path)
                if good == 2:
                    desktop_desc = htmlescape.escape(
                        '<span class="thenumberofthebeast">{}</span>',
                        1, meta_d
                    )
                    if noindex(full_path):
                        mobile_desc = '<span class="yeah">:-)</span>'
                    else:
                        mobile_desc = '<span class="thenumberofthebeast">:-D</span>'
                elif not noindex(full_path):
                    mobile_desc = '<span class="yeah">:-)</span>'
                    desktop_desc = '<span class="yeah">Text; indexable</span>'
                else:
                    mobile_desc = ':-|'
                    desktop_desc = 'Boring; unindexable'
                    
        compressout.write_b(
            htmlescape.escape(
                '''<tr><td class="mobile">{mobile_desc}</td>
                <td><a translate="no"
                    href="{site}?path={path}{referer}">{text}</a></td>
                <td class="desktop">{desktop_desc}</td></tr>
                ''',
                site=(0, my_url),
                path=(2, full_path[rootlen:] + '/'*is_dir),
                referer=(2, mk_referer_param(referer)),
                text=(1, x + '/'*is_dir),
                mobile_desc=(0, mobile_desc),
                desktop_desc=(0, desktop_desc),
            )
        )
    compressout.write_b('''            <!--</p>--></table>
        </div></main>
        __FOOTER__
    </body>
</html>\n''')
  687. def download(path):
  688.     if noindex(path):
  689.         compressout.write_h('X-Robots-Tag: noindex\n')
  690.     else:
  691.         compressout.write_h('X-Robots-Tag: index\n') # For verbosity.
  692.     try:
  693.         content = open(path).read()
  694.         if sys.version_info[0] == 2:
  695.             content.decode('utf-8')
  696.         compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
  697.         compressout.write_h(htmlescape.escape(
  698.                 'Link: <{}?path={}>',
  699.                 0, canonical_url,
  700.                 2, path[rootlen:]
  701.             ) + '; rel="canonical"; type="text/html"\n'
  702.         )
  703.     except:
  704.         compressout.write_h(htmlescape.escape(
  705.             'Link: <{}?path={}>; rel="canonical"\n',
  706.             0, canonical_url,
  707.             2, path[rootlen:]
  708.         )) # No type specified.
  709.     if if_none_match(path):
  710.         compressout.write_h('\n')
  711.         if os.getenv('REQUEST_METHOD') != 'HEAD':
  712.             compressout.write_b(content)
def cat(path, referer):
    '''
    Write out the source-viewing page for the file `path`.

    `referer` is the (URI, title) pair used to render a back-link.
    Sends the HTML/XHTML headers and, unless the request method is
    HEAD, the full page: JSON-LD metadata, title/description table,
    a "Quick links" fragment index and the numbered source listing.
    '''
    def ol_content(text):
        # Build the listing.  Returns a pair of strings:
        #   - the <li><pre> elements for the <ol id="code"> (one per line)
        #   - the "Quick links" <a> fragment index, sorted by name
        out_lines = []
        ids = []
        allowed_chars = string.ascii_letters + '_-'
        for index, line in enumerate(text.split('\n')):
            # Create a "permanent" fragment this line.
            this_id = ''
            # Find ids in Python and XHTML
            for decltype in ('def', 'class'):
                if line.strip().startswith(decltype + ' ') and '(' in line:
                    this_id = line.split(decltype, 1)[1].split('(')[0].strip()
            if 'id="' in line:
                this_id = line.split('id="')[1].split('"')[0]
            # Prevent bad ids.
            for ch in this_id:
                if ch not in allowed_chars:
                    this_id = ''
                    break
            # Duplicate names: only the first occurrence keeps the id.
            if this_id in ids:
                this_id = ''
            # Create the fragment identifier for the line.
            if this_id:
                ids.append(this_id)
                idline = 'id="content_{}"'.format(this_id)
            else:
                idline = ''
            # Create line
            out_lines.append(htmlescape.escape(
                    '    <li id="{}"><pre translate="no" {}>{}</pre></li>\n',
                    0, index + 1,
                    0, idline,
                    1, line,
            ))
        fragment_links = []
        for fragment in sorted(ids):
            fragment_links.append(
                (
                    '<a class="quick" href="#content_{0}" translate="no"' +
                    '>{0}</a>\n'
                ).format(
                    fragment
                )
            )
        return ''.join(out_lines), ''.join(fragment_links)
    
    try:
        content = open(path).read()
        if sys.version_info[0] == 2:
            content.decode('utf-8')  # Python 2: verify the file is UTF-8 text.
    except:
        # Not viewable as text: dump the raw content instead of a page.
        # NOTE(review): if open() itself failed, `content` is unbound
        # here and write_b raises NameError — confirm this branch is
        # only reachable via the Python 2 decode check.
        if noindex(path):
            compressout.write_h('X-Robots-Tag: noindex\n')
        else:
            compressout.write_h('X-Robots-Tag: index\n')
        compressout.write_h('\n')
        compressout.write_b(content)
        return
    compressout.write_h(html_page)
    compressout.write_h('\n')
    if os.getenv('REQUEST_METHOD') == 'HEAD':
        return
    
    # Page metadata: title, <meta name="description"> text and the
    # visible description markup.
    ignore, title, meta_description, p_description = mk_description(path)
    last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
    
    lines, fragment_links = ol_content(content)
    if not fragment_links:
        fragment_links = '(none)'
    
    # __HTML5NC__/__SITE__/__FOOTER__ look like server-side template
    # placeholders (cf. __HOST__ in `canonical_url`) — presumably
    # expanded by a post-processing step; verify before changing.
    compressout.write_b('''__HTML5NC__''')
    # JSON-LD structured data for search engines.
    compressout.write_b('''
<script type="application/ld+json">
{
    "@context":
    {
        "@vocab": "http://schema.org/"
    },
    "@type": "SoftwareSourceCode",
    "license": "https://opensource.org/licenses/BSD-2-Clause",
    "author":
    {
    ''')
    compressout.write_b('''
        "@type": "Person",
        "@id": "__SITE__/",
        "name": "{0}",
        "url": "__SITE__/"
    '''.format(owner))
    compressout.write_b('''
    },
    "publisher": {"@id": "__SITE__/"},
    "copyrightHolder": {"@id": "__SITE__/"},
    ''')
    compressout.write_b('''
    "url": "{}#code",
    "DateModified": "{}"
    '''.format(
        canonical_url + '?path=' + path[rootlen:],
        last_modified,
    ))
    compressout.write_b('''
}
</script>
    ''')
    # Everything up to (but excluding) the last path component.
    parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
    compressout.write_b(htmlescape.escape('''
        <link rel="stylesheet" type="text/css" href="/read/style.css"/>
        <title>{title}</title>
        <link rel="canonical" href="{canonical}"/>
        <link
            rel="alternate"
            href="{canonical}&amp;download=yes"
            type="text/plain"
        />
        <meta name="robots" content="{noindex_no}index"/>
        <meta name="description" content="{meta_description}"/>
    </head>
    <body>
        {navigation}
<main><div id="content">
    <h1 id="title" translate="no">{title}</h1>
    <div id="description">
        {content_description}
    </div>
    <table>
        <tr>
            <td>Last modified</td>
            <td><time datetime="{last_modified}">{last_modified}</time></td>
        </tr>
        <tr>
            <td>Lines</td>
            <td>{linecount}</td>
        </tr>
        <tr>
            <td>Indexable</td>
            <td>{indexable}</td>
        </tr>
    </table>
    <p class="notprint read-nav">
        <a href="{my_url}?path={parent_dir}">Parent directory</a>
        <a href="{my_url}?path={path}&amp;download=yes" target="_blank">Download</a>
        <a href="{my_url}?sitemap=html">CGIread sitemap</a>
        <a href="{my_url}">Main page</a>
    </p>
    <p class="notprint">
        Quick links:\n{fragments}
    </p>
<ol id="code">
{content}
</ol>
</div></main>
''',
        title=(2, title),
        content=(0, lines),
        parent_dir=(2, parent_link + mk_referer_param(referer)),
        navigation=(0, mk_navigation(referer, path[rootlen:])),
        canonical=(2, canonical_url + '?path=' + path[rootlen:]),
        path=(2, path[rootlen:]),
        # `noindex(path)` True -> "noindex" directive.
        noindex_no=(2, 'no' * noindex(path)),
        meta_description=(2, meta_description),
        content_description=(0, p_description),
        last_modified=(2, last_modified),
        linecount=(1, content.count('\n') + 1),
        # noindex() True means the page is NOT indexable.
        indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
        fragments=(0, fragment_links),
        my_url=(0, my_url),
    ))
    compressout.write_b('''
        __FOOTER__
    </body>
</html>
''')
  888. def if_none_match(path):
  889.     '''
  890.     ETag handling for `cat`, `ls` and `download`:
  891.     
  892.     
  893.     Returns `True` if content needs to be generated.
  894.     Outputs necessary headers and 304 statuses.
  895.     '''
  896.     try:
  897.         meta_time = os.stat(path + '.info').st_mtime
  898.     except:
  899.         meta_time = 0
  900.     if sys.version_info[0] > 2:
  901.         query_string = os.getenv('QUERY_STRING', '').encode('utf-8')
  902.     else:
  903.         query_string = os.getenv('QUERY_STRING', '')
  904.     ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
  905.         'x'*('application/xhtml+xml' in html_page),
  906.         'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
  907.         os.stat(path).st_mtime,
  908.         meta_time,
  909.         base64.b64encode(query_string),
  910.         os.stat('index.py').st_mtime,
  911.         os.stat('read.cfg').st_mtime,
  912.     )
  913.     compressout.write_h('Vary: If-None-Match\n')
  914.     compressout.write_h('ETag: {}\n'.format(ETag))
  915.     compressout.write_h(
  916. '''X-ETag-Synopsis: [x][z]-<f_time>(<m_time>)-<query>-(<s_time>-<c_time>)
  917. X-ETag-Description-x: "Client accepts application/xhtml+xml"
  918. X-ETag-Description-z: "Content-Encoding: gzip"
  919. X-ETag-Description-f_time: "Unix last modified time for the requested file"
  920. X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
  921. X-ETag-Description-query: "base64 encoded $QUERY_STRING"
  922. X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
  923. X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
  924. ''')
  925.     if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
  926.         compressout.write_h('Status: 304\n\n')
  927.         return False
  928.     else:
  929.         return True
  930. def is_injection_attempt(path_param, referer_URI, referer_title):
  931.     '''
  932.     Various checks to see if any form of injection attempt has been
  933.     made.  This function checks the `path`, `referer` and `title`
  934.     parameters.
  935.     
  936.     Returns True if the request is an injection attempt.
  937.     
  938.     - XSS
  939.     - URL injection
  940.     - Spam injection
  941.     - Restricted files access
  942.     '''
  943.     # If the path parameter contains an XSS attempt, it can't be corrected
  944.     evil = False
  945.     # Prevent attacks.
  946.     if '..' in path_param:
  947.         return True
  948.     for var in referer_URI, referer_title:
  949.         for ch in var:
  950.             if ord(ch) < 32:
  951.                 return True
  952.             if ch in '<>&\'"':
  953.                 return True
  954.             # NOTICE: The following will limit parameters to ASCII.
  955.             if ord(ch) > 126:
  956.                 return True
  957.     # Prevent linking to Mallory.
  958.     for start in ('http://', 'https://', '//', 'ftp://'):
  959.         if referer_URI.startswith(start):
  960.             hostname = referer_URI.split('//')[1].split('/')[0]
  961.             if hostname not in conf['allowed-referer-hosts']:
  962.                 return True
  963.             else:
  964.                 break
  965.     else:
  966.         if ':' in referer_URI:
  967.             return True
  968.     # Prevent injected spam
  969.     if spammy.spammy(referer_title) or len(referer_title) > 42:
  970.         return True
  971.     # No match.
  972.     return False
  973. def handle_injection_attempt(path_param, referer_URI, referer_title):
  974.     '''
  975.     Decide if the injection attempt was due to innocently following
  976.     a malicious link or due to creating one.
  977.     '''
  978.     # Check if the URL can be sanitized.
  979.     if is_injection_attempt(path_param, '', ''):
  980.         destination = 'https://en.wikipedia.org/wiki/Data_validation'
  981.     else:
  982.         destination = my_url + '?path=' + path_param
  983.     redirect_spam(destination)
def main():
    '''
    `compressout.init` MUST be called before `main`
    and `compressout.done` after.
    
    Parses and validates the CGI parameters, then dispatches to
    `index_page`, `sitemap`, `ls`, `cat` or `download`.
    '''
    # HTML vs XHTML: negotiate via the Accept header.
    global html_page
    html_page = 'Vary: Accept\n'
    if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
        html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
    else:
        html_page += 'Content-Type: text/html; charset=UTF-8\n'
    # Check that the method is either GET, HEAD or OPTIONS.
    # OPTIONS gets the Allow header with status 200; anything else 405.
    if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
        if os.getenv('REQUEST_METHOD') != 'OPTIONS':
            compressout.write_h('Status: 405\n')
        compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
        compressout.write_h('Content-Type: text/plain\n')
        compressout.write_h('\n')
        if os.getenv('REQUEST_METHOD') != 'OPTIONS':
            compressout.write_b('Method not allowed!\n')
        compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
        return
    # Get the parameters.
    params = cgi.FieldStorage()
    path = path_param = params.getfirst('path', default='')
    referer_URI = params.getfirst('referer', default='')
    referer_title = params.getfirst('title', default='Back')
    referer = (referer_URI, referer_title)
    download_flag = params.getfirst('download', default='no')
    sitemap_param = params.getfirst('sitemap', default='none')
    
    # No query string at all -> serve the index page.
    if not os.getenv('QUERY_STRING'):
        index_page()
        return
        
    # Bad request, but will match the evil patterns.
    # Keep it before the evil stopper.
    if bool(path_param) and not path_param.startswith('/'):
        status400('`path` is not relative to this site. (No leading slash.)')
        return
    
    # Do not allow evil requests.
    allow = True
    # Keep things within the server root.
    try:
        path = os.path.realpath(root + path)
    except:
        allow = False
    # realpath has resolved symlinks and '..'; anything that ends up
    # outside `root` is forbidden.
    if path != root and not path.startswith(root + '/'):
        allow = False
    # Stop at forbidden paths. #1/2
    for regex in conf['noaccess']:
        if re.match(regex, path[rootlen:]) is not None:
            allow = False
    
    # Prevent XSS, URL injection, spam injection and miscellaneous assholery.
    if is_injection_attempt(path_param, referer_URI, referer_title):
        allow = False
    if not allow:
        handle_injection_attempt(path_param, referer_URI, referer_title)
        return
    
    # Bad requests: nonsensical parameter values/combinations.
    if download_flag not in ('yes', 'no'):
        status400('`download` MUST be "yes", "no" or unset.')
        return
    if bool(path_param) and sitemap_param != 'none':
        status400('The `sitemap` parameter cannot be used with any other.')
        return
    if download_flag == 'yes' and bool(referer_URI):
        status400("`download=yes` can't be used with the `referer` parameter.")
        return
    if sitemap_param not in ('none', 'xml', 'html'):
        status400('`sitemap` MUST be "html", "xml" or unset.')
        return
    if download_flag == 'yes' and not bool(path_param):
        status400('Nothing to `download`. Use the `path` parameter.')
        return
    if bool(referer_URI) and not bool(path_param):
        status400('`referer` cannot be used without `path`')
        return
    if referer_title != 'Back' and not bool(referer_URI):
        status400('`referer` is not set.')
        return
    
    # NOTE(review): `allow` is always True here — the False case
    # returned via `handle_injection_attempt` above.
    if allow:
        # Generate sitemap?
        if sitemap_param != 'none':
            sitemap(sitemap_param)
        else:
            # Stop at forbidden paths. #2/2
            for regex in conf['topsecret']:
                if re.match(regex, path[rootlen:]) is not None:
                    status404()
                    break
            else:
                # Allowed to be seen.
                try:
                    # Directory probe: raises ENOTDIR for regular files.
                    os.listdir(path)
                    if download_flag == 'no':
                        if if_none_match(path):
                            ls(path, referer)
                    else:
                        status400("Can't download a directory.")
                except OSError as e:
                    if e.errno == errno.ENOTDIR:
                        if download_flag == 'no':
                            if if_none_match(path):
                                cat(path, referer)
                        else:
                            # `download` sets a few headers.
                            download(path)
                    elif e.errno == errno.ENOENT:
                        status404()
                    else:
                        # Any other OS error is unexpected at this point.
                        raise ValueError(
                            'errno must be either ENOTDIR or ENOENT'
                        )
if __name__ == '__main__':
    # Per `main`'s docstring: compressout.init() must run before main()
    # and compressout.done() after, so all buffered output is flushed.
    compressout.init()
    main()
    compressout.done()