source code of /read/index.py

Last modified
Lines 1469

Parent directory Download CGIread sitemap Main page

Quick links: cat code contact content description download footer forms handle_injection_attempt if_none_match index_page is_injection_attempt isword ls main mk_description mk_navigation mk_referer_param navigation noindex ol_content redirect_spam sitemap syntax title

  1. #!/usr/bin/python3
  2. # -*- coding: utf-8 -*-
# Site-wide settings used throughout this script.

# Filesystem directory that corresponds to the root of the website.  It is
# appended to `sys.path` right after these constants (presumably so that
# site-local helper modules can be imported -- confirm) and its length is
# used to convert absolute filesystem paths into site-relative ones.
root = '/var/www'
# Name of the site owner.
owner = 'Oskar Skog'
# Site-relative URL of this script; interpolated into generated links/forms.
my_url = '/read/'
# Absolute URL of this script; used for the canonical `Link` header and for
# the `<loc>` entries of the XML sitemap.
canonical_url = 'https://oskog97.com/read/'
# Static HTML bodies that are sent verbatim by status403(), status404() and
# status503() respectively.
html403file = '/var/www/oops/403.html'
html404file = '/var/www/oops/404.html'
html503file = '/var/www/oops/cgi503.html'
  10. import sys
  11. sys.path.append(root)
  12. import cgi
  13. import os
  14. import errno
  15. import compressout
  16. import base64
  17. import re
  18. import time
  19. import htmlescape
  20. import string
  21. import spammy
  22. import sitemap as mod_sitemap  # Name conflict with already existing function.
  23. import cgitb
  24. cgitb.enable()
  25. rootlen = len(root)
  26. #html_mime = 'text/html'      # Set to XHTML later.
  27. html_page = 'Content-Type: text/html; charset=UTF-8\n'  # Set to XHTML later.
  28. conf = eval(open('read.cfg').read())
  29. def redirect_spam(destination):
  30.     '''`destination` is the URL to which assholes should be redirected.'''
  31.     compressout.write_h('Status: 303\n')
  32.     compressout.write_h('Location: {}\n'.format(destination))
  33.     compressout.write_h('\n')
  34. def status400(message):
  35.     '''HTTP 400; `message` goes UNESCAPED inside a <pre> element.'''
  36.     compressout.write_h('Status: 400\n')
  37.     compressout.write_h(html_page)
  38.     compressout.write_h('\n')
  39.     compressout.write_b('''<!DOCTYPE html>
  40. <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
  41.     <head>
  42.         <meta charset="utf-8"/>
  43.         <meta name="viewport" content="width=device-width, initial-scale=1"/>
  44.         <link rel="stylesheet" href="https://oskog97.com/style.css" type="text/css"/>
  45.         <link rel="icon" type="image/png" href="/favicon.png"/>
  46.         <link rel="canonical" href="https://oskog97.com/read/"/>
  47. <!-- End html5 macro. -->
  48.         <title>400 - Bad Request</title>
  49.     </head>
  50.     <body>
  51.         
  52. <!-- BEGIN autogenerated navigation -->
  53. <nav><div id="navigation"><div id="nav_inner">
  54. <p><a href="#content" class="textonly">Skip navigation</a></p>
  55. <p class="row">
  56. <span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
  57. &gt;&gt;
  58. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/anonymine/">Anonymine</a><span class="textonly" translate="no">]</span>
  59. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/light-sensor/">Analog light sensor</a><span class="textonly" translate="no">]</span>
  60. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/PLLM-M702A/">Reverse-engineered schematics for PLLM-M702A</a><span class="textonly" translate="no">]</span>
  61. <span class="textonly" translate="no">[</span><a class="sub" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  62. <span class="textonly" translate="no">[</span><a class="sub" href="/thinkpad/">-&gt; My IBM thinkpad</a><span class="textonly" translate="no">]</span>
  63. </p>
  64. <p class="row">
  65. <span class="textonly" translate="no">[</span><a class="head" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  66. &gt;&gt;
  67. <span class="textonly" translate="no">]</span><span class="sub active">Website's scripts</span><span class="textonly" translate="no">[</span>
  68. </p>
  69. <p class="row">
  70. <span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
  71. </p>
  72. <hr class="textonly"/>
  73. </div></div></nav>
  74. <!-- END autogenerated navigation -->
  75.         <main><div id="content">
  76.             <h1 id="title">400 - Bad Request</h1>
  77.             <pre>{}</pre>
  78.             <p>
  79.                 Your request can't be understood.
  80.                 Check the parameters.
  81.             </p>
  82.             <p><a href="/read/">Documentation for the parameters</a></p>
  83.         </div></main>
  84. '''.format(message))
  85.     compressout.write_b('''
  86.         
  87. <!-- INCLUDED FOOTER -->
  88.     <hr class="textonly"/>
  89.     <p>
  90.         Copyright © Oskar Skog<br/>
  91.         Website content released under the <a
  92.         href="https://creativecommons.org/licenses/by/4.0/" rel="license noopener"
  93.         target="_blank">Creative Commons Attribution (CC-BY 4.0)</a> license
  94.         and my software usually under the <span class="a"><a target="_blank"
  95.         rel="noopener"
  96.         href="https://opensource.org/licenses/BSD-2-Clause">FreeBSD license
  97.         (2-clause)</a>.</span>
  98.         <br/>
  99.         Images may be from other sites, I should have cited useful sources
  100.         somewhere on the page.
  101.         <span class="notprint">Contact me if I haven't.</span>
  102.     </p>
  103.     <p id="contact" class="notprint">
  104.         You can contact me at: <a href="mailto:oskar@oskog97.com"
  105.         rel="noopener" target="_blank">oskar@oskog97.com</a>
  106.         <span class="a">(<a href="/pgp-pub/oskar.asc"
  107.                             >PGP public key</a>)</span>
  108.     </p>
  109.     <p> <a class="notprint" href="https://oskog97.com/read/?path=/style.css">
  110.             CSS Stylesheet
  111.         </a>
  112.         <a href="https://validator.w3.org/check/referrer" rel="nofollow noopener"
  113.             target="_blank" class="notprint"><span
  114.             class="img">Valid HTML5</span
  115.         ></a><br/>
  116.     </p>
  117. </div></footer>
  118. <!-- END OF INCLUDED FOOTER -->
  119.     </body>
  120. </html>''')
  121. def status403():
  122.     '''HTTP 403'''
  123.     compressout.write_h(html_page)
  124.     compressout.write_h('Status: 403\n\n')
  125.     compressout.write_b(open(html403file).read())
  126. def status404():
  127.     '''HTTP 404'''
  128.     compressout.write_h('Status: 404\n')
  129.     compressout.write_h(html_page)
  130.     compressout.write_h('\n')
  131.     compressout.write_b(open(html404file).read())
  132. def status503():
  133.     '''
  134.     HTTP 503
  135.     
  136.     Call this if there is too much load on the server to do something.
  137.     (Used by the sitemap function.)
  138.     '''
  139.     compressout.write_h('Status: 503\n')
  140.     compressout.write_h(html_page)
  141.     # One factor is load avg for 1 minute, add some slop to the delay for bots.
  142.     compressout.write_h('Retry-After: 90\n')
  143.     compressout.write_h('\n')
  144.     compressout.write_b(open(html503file).read())
  145. def index_page():
  146.     '''https://oskog97.com/read/'''
  147.     # Handle 304s.
  148.     ETag = '"{}{}{}"'.format(
  149.         'x'*('application/xhtml+xml' in html_page),
  150.         'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
  151.         os.stat('index.py').st_mtime,
  152.     )
  153.     compressout.write_h('Vary: If-None-Match\n')
  154.     compressout.write_h('ETag: {}\n'.format(ETag))
  155.     compressout.write_h(html_page)
  156.     if os.getenv('HTTP_IF_NONE_MATCH') == ETag:
  157.         compressout.write_h('Status: 304\n\n')
  158.         return
  159.     compressout.write_h('\n')
  160.     if os.getenv('REQUEST_METHOD') == 'HEAD':
  161.         return
  162.     # Write out a static page.
  163.     compressout.write_b('''<!DOCTYPE html>
  164. <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
  165.     <head>
  166.         <meta charset="utf-8"/>
  167.         <meta name="viewport" content="width=device-width, initial-scale=1"/>
  168.         <link rel="stylesheet" href="https://oskog97.com/style.css" type="text/css"/>
  169.         <link rel="icon" type="image/png" href="/favicon.png"/>
  170.         <link rel="canonical" href="https://oskog97.com/read/"/>
  171. <!-- End html5 macro. -->
  172.     <!-- With canonical link tag. -->
  173.         <link rel="stylesheet" type="text/css" href="/read/style.css"/>
  174.         <meta name="description" content="Interested in the scripts I have
  175.         on my website? Come and take a look at them."/>
  176.         <title>Website's scripts</title>
  177.     </head>
  178.     <body>
  179.         
  180. <!-- BEGIN autogenerated navigation -->
  181. <nav><div id="navigation"><div id="nav_inner">
  182. <p><a href="#content" class="textonly">Skip navigation</a></p>
  183. <p class="row">
  184. <span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
  185. &gt;&gt;
  186. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/anonymine/">Anonymine</a><span class="textonly" translate="no">]</span>
  187. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/light-sensor/">Analog light sensor</a><span class="textonly" translate="no">]</span>
  188. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/PLLM-M702A/">Reverse-engineered schematics for PLLM-M702A</a><span class="textonly" translate="no">]</span>
  189. <span class="textonly" translate="no">[</span><a class="sub" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  190. <span class="textonly" translate="no">[</span><a class="sub" href="/thinkpad/">-&gt; My IBM thinkpad</a><span class="textonly" translate="no">]</span>
  191. </p>
  192. <p class="row">
  193. <span class="textonly" translate="no">[</span><a class="head" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  194. &gt;&gt;
  195. <span class="textonly" translate="no">]</span><span class="sub active">Website's scripts</span><span class="textonly" translate="no">[</span>
  196. </p>
  197. <p class="row">
  198. <span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
  199. </p>
  200. <hr class="textonly"/>
  201. </div></div></nav>
  202. <!-- END autogenerated navigation -->
  203.         <main><div id="content">
  204.             <h1 id="title">Website's scripts</h1>
  205.     ''')
  206.     compressout.write_b('''
  207.             <p>
  208.                 Interested in the scripts I have on my website?
  209.                 Go take a look at them; start crawling the
  210.                 <a href="{0}?path=/">root directory</a> or take a look
  211.                 at the <span class="a"><a href="{0}?sitemap=html"
  212.                 >(sub)sitemap</a>.</span>
  213.             </p>
  214.             <div id="syntax">
  215.                 <h2>Parameter syntax</h2>
  216.                 <p>
  217.                     Descriptions for the parameters can be found in
  218.                     the request forms.
  219.                 </p>
  220.                 <ul>
  221.                     <li>
  222.                         Asterisks <q>*</q> represent a value that can be
  223.                         (almost) anything.
  224.                     </li>
  225.                     <li>Square brackets <q>[]</q> represent optional.</li>
  226.                     <li>Curly brackets <q>&#x7b;&#x7d;</q> represent mandatory.</li>
  227.                     <li>Pipes <q>|</q> represent either or.</li>
  228.                 </ul>
  229.                 <p>There are three acceptable "sets" of parameters:</p>
  230.                 <ol>
  231. <li><pre>{0}?sitemap=&#x7b;html|xml&#x7d;</pre></li>
  232. <li><pre>{0}?path=*[&amp;download=yes]</pre></li>
  233. <li><pre>{0}?path=*[&amp;referer=*[&amp;title=*]]</pre></li>
  234.                 </ol>
  235.                 <p>
  236.                     The order of the valid parameters doesn't matter, but
  237.                     this is the recommended/canonical order.
  238.                 </p>
  239.             </div>
  240.             <div id="forms">
  241.                 <h2>Request forms</h2>
  242.                 <p><strong>
  243.                     Notice that these are three different forms.
  244.                 </strong></p>
  245.                 <form action="{0}" method="get">
  246.                 <h3>Sitemap</h3>
  247.                 <p>
  248.                     The <code>sitemap</code> parameter can be either
  249.                     <q><code>html</code></q>, <q><code>xml</code></q>
  250.                     or the default <q><code>none</code></q>.
  251.                     It can't be used together with any other parameters.
  252.                 </p>
  253.                 <p>
  254.                     <input type="radio" name="sitemap" value="html"/>
  255.                     Request an HTML sitemap instead of a page<br/>
  256.                     <input type="radio" name="sitemap" value="xml"/>
  257.                     request an XML sitemap instead of a page<br/>
  258.                     <input type="submit"/>
  259.                 </p>
  260.                 </form>
  261.                 <form action="{0}" method="get">
  262.                 <h3>Page</h3>
  263.                 <p>
  264.                     A page (source code of a CGI script) is selected with the
  265.                     <code>path</code> parameter.  The value of the
  266.                     <code>path</code> parameter is a URL relative to this
  267.                     site, ie. an URL beginning with a single slash.
  268.                 </p>
  269.                 <p>
  270.                     The <code>path</code> is the site-local URL to the CGI
  271.                     script or directory you're interested in.  If you set the
  272.                     value to <q><code>/read/index.py</code></q>, you'll get the
  273.                     source code for this script. And if you set it to
  274.                     <q><code>/</code></q>, you'll get a directory listing
  275.                     of the site's root directory.
  276.                 </p>
  277.                 <p>
  278.                     Path/URL: <input type="text" name="path" value="/"/>
  279.                     <input type="submit"/><br/>
  280.                     <input type="checkbox" name="download" value="yes"/>
  281.                     Download / see it as plain text
  282.                     
  283.                 </p>
  284.                 <p>
  285.                     The <code>download</code> parameter can be set to either
  286.                     <q><code>yes</code></q> or the default
  287.                     <q><code>no</code></q>.  The download option does
  288.                     obviously not work with directories.
  289.                 </p>
  290.                 </form>
  291.                 <form action="{0}" method="get">
  292.                 <h3>Link back to a referencing page</h3>
  293.                 <p>
  294.                     If <code>download</code> is <q><code>no</code></q> or
  295.                     unset and a page (not a sitemap) was requested, it is
  296.                     possible to change the navigation to make the requested
  297.                     page link back to a referring page.
  298.                 </p>
  299.                 <p>
  300.                     The <code>referer</code> (yes, misspelled like the HTTP
  301.                     Referer) parameter is the URL of the referencing page.
  302.                     (Don't try to specify a site that isn't mine.)
  303.                     The <code>title</code> parameter gives the back link a
  304.                     different text than <q>Back</q>.
  305.                 </p>
  306.                 <table>
  307.                     <tr>
  308.                         <th><code>path</code></th>
  309.                         <td><input type="text" name="path" value="/"/></td>
  310.                     </tr>
  311.                     <tr>
  312.                         <th><code>referer</code></th>
  313.                         <td><input type="text" name="referer"/></td>
  314.                     </tr>
  315.                     <tr>
  316.                         <th><code>title</code></th>
  317.                         <td><input type="text" name="title"/></td>
  318.                     </tr>
  319.                     <tr>
  320.                         <td></td>
  321.                         <td><input type="submit"/></td>
  322.                     </tr>
  323.                 </table>
  324.                 </form>
  325.             </div>
  326.         </div></main>
  327.     '''.format(my_url))
  328.     compressout.write_b('''
  329.         
  330. <!-- INCLUDED FOOTER -->
  331. <footer><div id="footer">
  332.     <hr class="textonly"/>
  333.     <p>
  334.         Copyright © Oskar Skog<br/>
  335.         Website content released under the <a
  336.         href="https://creativecommons.org/licenses/by/4.0/" rel="license noopener"
  337.         target="_blank">Creative Commons Attribution (CC-BY 4.0)</a> license
  338.         and my software usually under the <span class="a"><a target="_blank"
  339.         rel="noopener"
  340.         href="https://opensource.org/licenses/BSD-2-Clause">FreeBSD license
  341.         (2-clause)</a>.</span>
  342.         <br/>
  343.         Images may be from other sites, I should have cited useful sources
  344.         somewhere on the page.
  345.         <span class="notprint">Contact me if I haven't.</span>
  346.     </p>
  347.     <p id="contact" class="notprint">
  348.         You can contact me at: <a href="mailto:oskar@oskog97.com"
  349.         rel="noopener" target="_blank">oskar@oskog97.com</a>
  350.         <span class="a">(<a href="/pgp-pub/oskar.asc"
  351.                             >PGP public key</a>)</span>
  352.     </p>
  353.     <p> <a class="notprint" href="https://oskog97.com/read/?path=/style.css">
  354.             CSS Stylesheet
  355.         </a>
  356.         <a href="https://validator.w3.org/check/referrer" rel="nofollow noopener"
  357.             target="_blank" class="notprint"><span
  358.             class="img">Valid HTML5</span
  359.         ></a><br/>
  360.     </p>
  361. </div></footer>
  362. <!-- END OF INCLUDED FOOTER -->
  363.     </body>
  364. </html>
  365. ''')
  366. def noindex(path):
  367.     '''
  368.     Returns True if `path` should be noindexed.
  369.     
  370.     `path` is an absolute **filesystem** path.
  371.     '''
  372.     def isword(w):
  373.         letters = string.ascii_letters + ',.'
  374.         for ch in w:
  375.             if w not in letters:
  376.                 return False
  377.         return True
  378.     # 1. White list
  379.     # 2. Black list
  380.     # 3. Page quality (not applicable for directories)
  381.     
  382.     # Check whitelist first.
  383.     for regex in conf['doindex']:
  384.         if re.match(regex, path[rootlen:]) is not None:
  385.             return False
  386.             break
  387.     
  388.     # Blacklist (two kinds):
  389.     # - Generated from another file.
  390.     # - Explicitly blacklisted in 'read.cfg'.
  391.     for match, replace in conf['madefrom']:
  392.         if re.match(match, path[rootlen:]) is not None:
  393.             try:
  394.                 os.stat(root + re.sub(match, replace, path[rootlen:]))
  395.                 return True
  396.             except:
  397.                 pass
  398.     for regex in conf['noindex'] + conf['hide']:
  399.         if re.match(regex, path[rootlen:]) is not None:
  400.             return True
  401.     
  402.     # Quality:
  403.     #   - Text file
  404.     #   - At least 3072 Unicode code points
  405.     #   - At least 300 words
  406.     #   - At least 60 lines
  407.     #   - Half the limitations if a meta description and title is found
  408.     #   - A third of the limimitations if an onpage description is found
  409.     try:
  410.         os.listdir(path)
  411.         return False
  412.     except:
  413.         pass
  414.     # Normal file.
  415.     try:
  416.         if sys.version_info[0] > 2:
  417.             text = open(path).read()
  418.         else:
  419.             text = open(path).read().decode('utf-8')
  420.     except:
  421.         return True
  422.     min_chars, min_words, min_lines, min_comments = 3072, 300, 60, 24
  423.     quality = mk_description(path)[0] + 1
  424.     min_chars //= quality; min_words //= quality
  425.     min_lines //= quality; min_comments //= quality
  426.     if len(text) < min_chars:
  427.         return True
  428.     if text.count('\n') + 1 < min_lines:
  429.         return True
  430.     n_comments = 0
  431.     is_comment = re.compile('^(.*#.*| *\\* .*|.*<!--.*|.*\'\'\'.*)$')
  432.     for line in text.split('\n'):
  433.         if re.match(is_comment, line) is not None:
  434.             n_comments += 1
  435.     if n_comments < min_comments:
  436.         return True
  437.     if len(list(filter(isword, text.replace('\n', ' ').split(' ')))) < min_words:
  438.         return True
  439.     # Passed the quality tests:
  440.     return False
  441. def mk_navigation(referer, title):
  442.     '''
  443.     Returns a string which is the navigation bar's HTML.
  444.     
  445.     `title` is the title of the requested page.
  446.     
  447.     `referer` is used to **optionally** ``integrate`` a page.
  448.     `referer` is a tuple of (URL, title) for the "back" link.
  449.     '''
  450.     if referer[0]:
  451.         return htmlescape.escape('''<!-- Navigation generated by CGIread. -->
  452. <nav><div id="navigation"><div id="nav_inner">
  453. <p><a href="#content" class="textonly">Skip navigation</a></p>
  454. <p class="row">
  455. <span class="textonly" translate="no">[</span><a class="head" href="{URL}">{title}</a><span class="textonly" translate="no">]</span>
  456. &gt;&gt;
  457. <span class="textonly" translate="no">]</span><span class="sub active">{me}</span><span class="textonly" translate="no">[</span>
  458. <span class="textonly" translate="no">[</span><a class="sub" href="{my_url}?sitemap=html">Sitemap for website's scripts</a><span class="textonly" translate="no">]</span>
  459. </p>
  460. <p class="row">
  461. <span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
  462. &gt;&gt;
  463. <span class="textonly" translate="no">[</span><a class="sub" href="/read/">Website's scripts</a><span class="textonly" translate="no">]</span>
  464. <span class="textonly" translate="no">[</span><a class="sub" href="/pages/policy.html">Privacy policy &amp; terms of use</a><span class="textonly" translate="no">]</span>
  465. <span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
  466. </p>
  467. <hr class="textonly"/>
  468. </div></div></nav>
  469. <!-- End of navigation. -->''',
  470.             URL=(2, referer[0]),
  471.             title=(1, referer[1]),
  472.             me=(1, title),
  473.             my_url=(0, my_url),
  474.         )
  475.     else:
  476.         return '''
  477. <!-- BEGIN autogenerated navigation -->
  478. <nav><div id="navigation"><div id="nav_inner">
  479. <p><a href="#content" class="textonly">Skip navigation</a></p>
  480. <p class="row">
  481. <span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
  482. &gt;&gt;
  483. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/anonymine/">Anonymine</a><span class="textonly" translate="no">]</span>
  484. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/light-sensor/">Analog light sensor</a><span class="textonly" translate="no">]</span>
  485. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/PLLM-M702A/">Reverse-engineered schematics for PLLM-M702A</a><span class="textonly" translate="no">]</span>
  486. <span class="textonly" translate="no">[</span><a class="sub" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  487. <span class="textonly" translate="no">[</span><a class="sub" href="/thinkpad/">-&gt; My IBM thinkpad</a><span class="textonly" translate="no">]</span>
  488. </p>
  489. <p class="row">
  490. <span class="textonly" translate="no">[</span><a class="head" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  491. &gt;&gt;
  492. <span class="textonly" translate="no">]</span><span class="sub active">Website's scripts</span><span class="textonly" translate="no">[</span>
  493. </p>
  494. <p class="row">
  495. <span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
  496. </p>
  497. <hr class="textonly"/>
  498. </div></div></nav>
  499. <!-- END autogenerated navigation -->
  500. '''
  501. def mk_referer_param(referer):
  502.     '''Returns one of:
  503.         ''
  504.         '&referer=' + referer[0]
  505.         '&referer=' + referer[0] + '&title=' + referer[1]
  506.     to be added to links from the requested page.
  507.     
  508.     `referer` is used to **optionally** ``integrate`` a page.
  509.     See `mk_navigation`
  510.     '''
  511.     if referer[0]:
  512.         if referer[1] != 'Back':
  513.             title = '&title={}'.format(referer[1])
  514.         else:
  515.             title = ''
  516.         return '&referer={}{}'.format(referer[0], title)
  517.     else:
  518.         return ''
  519. def mk_description(path):
  520.     '''
  521.     Return three strings: (good, title, meta_description, onpage_description)
  522.     
  523.     `path` is the absolute filesystem path to the requested page.
  524.     
  525.     `good` is
  526.         0       no title and description
  527.         1       title and meta description only
  528.         2       also an onpage description
  529.     
  530.     `title` is the title of the page.
  531.     
  532.     `meta_description` is the content of the description meta tag.
  533.     
  534.     `onpage_description` is HTML content for the onpage description.
  535.     requested page.
  536.     '''
  537.     good = 0
  538.     title = "source code of {}".format(path[rootlen:])
  539.     meta_description = ''
  540.     onpage_description = None
  541.     try:
  542.         content = open(path + '.info').read().split('\n')
  543.         good = 1
  544.     except:
  545.         pass
  546.     if good:
  547.         title = content[0]
  548.         try:
  549.             sep = content.index('.')
  550.         except ValueError:
  551.             sep = None
  552.         if sep is not None:
  553.             good = 2
  554.             meta_description = '\n'.join(content[1:sep])
  555.             onpage_description = '\n'.join(content[sep+1:])
  556.         else:
  557.             meta_description = '\n'.join(content[1:])
  558.     if onpage_description is None:
  559.         onpage_description = htmlescape.escape('<p>{}</p>',1,meta_description)
  560.     return good, title, meta_description, onpage_description
  561. def sitemap(sitemap_type):
  562.     '''
  563.     Write out an XML or HTML sitemap.
  564.     sitemap_type in ('xml', 'html')
  565.     
  566.     The XML sitemap will exclude entries from `conf['noxmlsitemap']`.
  567.     '''    
  568.     
  569.     if os.getenv('REQUEST_METHOD') != 'HEAD': # NOTICE
  570.         # Prevent over-revving the server.
  571.         # HEAD requests are basically no-ops.
  572.         maxload = conf['sitemap-maxload']
  573.         if os.getloadavg()[0] > maxload['load-avg1']:
  574.             status503()
  575.             return
  576.         try:
  577.             access_times = list(map(
  578.                 float, open('read.throttlecontrol').read().strip().split(':')
  579.             ))
  580.         except:
  581.             access_times = [0]
  582.         if time.time() - access_times[-1] < maxload['throttle-time']:
  583.             status503()
  584.             return
  585.         access_times.insert(0, time.time())
  586.         access_times = access_times[:maxload['throttle-requests']]
  587.         f = open('read.throttlecontrol', 'w')
  588.         f.write(':'.join(list(map(str, access_times))) + '\n')
  589.         f.close()
  590.     # Write headers before doing anything else.
  591.     # A HEAD request doesn't need to know the length (it's TE chunked).
  592.     if sitemap_type == 'xml':
  593.         compressout.write_h('Content-Type: application/xml; charset=UTF-8\n')
  594.         compressout.write_h(
  595.             'Link: <{my_url}?sitemap=html>'.format(my_url=canonical_url) +
  596.             '; rel="canonical"' +
  597.             '; type="text/html"\n'
  598.         )
  599.         compressout.write_h('X-Robots-Tag: noindex\n\n') # NOTE: last.
  600.     elif sitemap_type == 'html':
  601.         compressout.write_h(html_page)
  602.         compressout.write_h('\n')
  603.     else:
  604.         assert False, "Neither 'xml' nor 'html'"
  605.     if os.getenv('REQUEST_METHOD') == 'HEAD': # NOTICE
  606.         return
  607.     
  608.     # Find the pages worth being in the sitemap.
  609.     no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
  610.     paths = []
  611.     
  612.     for basedir, dirs, files in os.walk(root, topdown=True):
  613.         # Exclude hidden directories:
  614.         remove_list = []
  615.         sys.stderr.write('In {}\n'.format(basedir))
  616.         sys.stderr.write('Dirs: {}\n'.format(repr(dirs)))
  617.         for dirname in dirs:
  618.             dirpath = os.path.join(basedir, dirname)[rootlen:]
  619.             for regex in no_access:
  620.                 if re.match(regex, dirpath) is not None:
  621.                     #dirs.remove(dirname)
  622.                     # BUG: The for loop will skip items in the list if
  623.                     # other items are removed while looping.
  624.                     # This caused some real' nasty stuff like sshin to
  625.                     # be crawled, took a whopping .65 seconds.
  626.                     remove_list.append(dirname)
  627.                     break
  628.         sys.stderr.write('Removed dirs: {}\n'.format(repr(remove_list)))
  629.         for dirname in remove_list:
  630.             dirs.remove(dirname)
  631.         
  632.         # Iterate over files:
  633.         for filename in files:
  634.             filepath = os.path.join(basedir, filename)
  635.             # No symlinks allowed.
  636.             #if os.stat(filepath).st_mode == os.lstat(filepath).st_mode:
  637.             if not os.path.islink(filepath):
  638.                 #try:
  639.                     description = mk_description(filepath)
  640.                     if description[0]:
  641.                         # Only indexable content allowed.
  642.                         if not noindex(filepath):
  643.                             paths.append((filepath[rootlen:], description[3]))
  644.                         else:
  645.                             sys.stderr.write('{} is noindexed\n'.format(filepath))
  646.                     else:
  647.                         sys.stderr.write('{} has no description\n'.format(filepath))
  648.                 #except IOError as error:
  649.                     #assert error.errno in (
  650.                         #errno.EISDIR, errno.EACCES
  651.                     #), error.errno
  652.             else:
  653.                 sys.stderr.write('{} is link\n'.format(filepath))
  654.     
  655.     paths.sort(key=lambda x: x[0])
  656.     
  657.     # Print the body.
  658.     if sitemap_type == 'xml':
  659.         compressout.write_b('''<?xml version="1.0" encoding="UTF-8"?>
  660. <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  661. ''')
  662.         #
  663.         for path, description in paths:
  664.             # Loop through all the regexes:
  665.             for regex in conf['noxmlsitemap']:
  666.                 if re.match(regex, path) is not None:
  667.                     break
  668.             else:
  669.                 compressout.write_b(htmlescape.escape('''<url>
  670.     <loc>{canonical_url}?path={path}</loc>
  671.     <priority>0.5</priority>
  672. ''',
  673.                     canonical_url=(0, canonical_url),
  674.                     path=(1, path),
  675.                 ))
  676.                 mod_sitemap.lastmod_changefreq(
  677.                     root + path,
  678.                     compressout,
  679.                 )
  680.                 compressout.write_b('</url>\n')
  681.         #
  682.         compressout.write_b('</urlset>\n')
  683.     elif sitemap_type == 'html':
  684.         compressout.write_b('''<!DOCTYPE html>
  685. <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
  686.     <head>
  687.         <meta charset="utf-8"/>
  688.         <meta name="viewport" content="width=device-width, initial-scale=1"/>
  689.         <link rel="stylesheet" href="https://oskog97.com/style.css" type="text/css"/>
  690.         <link rel="icon" type="image/png" href="/favicon.png"/>
  691. <!-- End html5nc macro. -->
  692.         <link rel="canonical" href="{canonical_url}?sitemap=html"/>
  693.         <link rel="alternate" href="{canonical_url}?sitemap=xml"
  694.             type="application/xml"/>
  695.         <meta name="robots" content="noindex, follow"/>
  696.         <title>Sitemap for scripts' source code</title>
  697.         <meta name="description" content="
  698.             Sitemap of all scripts available through /read/.
  699.         "/>
  700.     </head>
  701.     <body>
  702.         
  703. <!-- BEGIN autogenerated navigation -->
  704. <nav><div id="navigation"><div id="nav_inner">
  705. <p><a href="#content" class="textonly">Skip navigation</a></p>
  706. <p class="row">
  707. <span class="textonly" translate="no">[</span><a class="head" href="/">Home</a><span class="textonly" translate="no">]</span>
  708. &gt;&gt;
  709. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/anonymine/">Anonymine</a><span class="textonly" translate="no">]</span>
  710. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/light-sensor/">Analog light sensor</a><span class="textonly" translate="no">]</span>
  711. <span class="textonly" translate="no">[</span><a class="sub" href="/projects/PLLM-M702A/">Reverse-engineered schematics for PLLM-M702A</a><span class="textonly" translate="no">]</span>
  712. <span class="textonly" translate="no">[</span><a class="sub" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  713. <span class="textonly" translate="no">[</span><a class="sub" href="/thinkpad/">-&gt; My IBM thinkpad</a><span class="textonly" translate="no">]</span>
  714. </p>
  715. <p class="row">
  716. <span class="textonly" translate="no">[</span><a class="head" href="/small-scripts/">Small scripts</a><span class="textonly" translate="no">]</span>
  717. &gt;&gt;
  718. <span class="textonly" translate="no">]</span><span class="sub active">Website's scripts</span><span class="textonly" translate="no">[</span>
  719. </p>
  720. <p class="row">
  721. <span class="textonly" translate="no">[</span><a class="sub" href="/sitemap.py">Sitemap</a><span class="textonly" translate="no">]</span>
  722. </p>
  723. <hr class="textonly"/>
  724. </div></div></nav>
  725. <!-- END autogenerated navigation -->
  726.         <main><div id="content" class="sitemap">
  727.             <h1 id="title">Sitemap for scripts' source code</h1>
  728.             <p><a href="{my_url}?path=/">Root directory</a></p>
  729.             <dl>
  730. '''.format(my_url=my_url, canonical_url=canonical_url))
  731.         #
  732.         indent = 16 * ' '
  733.         for path, description in paths:
  734.             compressout.write_b(indent + htmlescape.escape(
  735.                 '''<dt><a translate="no" href="{my_url}?path={path}">
  736.                     {path}
  737.                 </a></dt>\n''',
  738.                 path=(0, path),
  739.                 my_url=(0, canonical_url),
  740.             ))
  741.             compressout.write_b(indent +
  742.                 htmlescape.escape('<dd>{}</dd>\n', 0, description)
  743.             )
  744.         #
  745.         compressout.write_b('''            </dl>
  746.         </div></main>
  747.         
  748. <!-- INCLUDED FOOTER -->
  749. <footer><div id="footer">
  750.     <hr class="textonly"/>
  751.     <p>
  752.         Copyright © Oskar Skog<br/>
  753.         Website content released under the <a
  754.         href="https://creativecommons.org/licenses/by/4.0/" rel="license noopener"
  755.         target="_blank">Creative Commons Attribution (CC-BY 4.0)</a> license
  756.         and my software usually under the <span class="a"><a target="_blank"
  757.         rel="noopener"
  758.         href="https://opensource.org/licenses/BSD-2-Clause">FreeBSD license
  759.         (2-clause)</a>.</span>
  760.         <br/>
  761.         Images may be from other sites, I should have cited useful sources
  762.         somewhere on the page.
  763.         <span class="notprint">Contact me if I haven't.</span>
  764.     </p>
  765.     <p id="contact" class="notprint">
  766.         You can contact me at: <a href="mailto:oskar@oskog97.com"
  767.         rel="noopener" target="_blank">oskar@oskog97.com</a>
  768.         <span class="a">(<a href="/pgp-pub/oskar.asc"
  769.                             >PGP public key</a>)</span>
  770.     </p>
  771.     <p> <a class="notprint" href="https://oskog97.com/read/?path=/style.css">
  772.             CSS Stylesheet
  773.         </a>
  774.         <a href="https://validator.w3.org/check/referrer" rel="nofollow noopener"
  775.             target="_blank" class="notprint"><span
  776.             class="img">Valid HTML5</span
  777.         ></a><br/>
  778.     </p>
  779. </div></footer>
  780. <!-- END OF INCLUDED FOOTER -->
  781.     </body>
  782. </html>
  783. ''')
  784.     else:
  785.         assert False, "Neither 'xml' nor 'html'"
  786. def ls(path, referer):
  787.     '''
  788.     '''
  789.     compressout.write_h(html_page)
  790.     compressout.write_h('\n')
  791.     if os.getenv('REQUEST_METHOD') == 'HEAD':
  792.         return
  793.     compressout.write_b('''<!DOCTYPE html>
  794. <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
  795.     <head>
  796.         <meta charset="utf-8"/>
  797.         <meta name="viewport" content="width=device-width, initial-scale=1"/>
  798.         <link rel="stylesheet" href="https://oskog97.com/style.css" type="text/css"/>
  799.         <link rel="icon" type="image/png" href="/favicon.png"/>
  800. <!-- End html5nc macro. -->
  801. ''')
  802.     compressout.write_b(htmlescape.escape('''
  803.         <link rel="stylesheet" type="text/css" href="/read/style.css"/>
  804.         <title>Index of {name}</title>
  805.         <meta name="robots" content="{robots_follow}, noindex"/>
  806.         <link rel="canonical" href="{canonical_url}?path={name}"/>
  807.     </head>
  808.     <body>
  809.         {navigation}
  810.         <main><div id="content" class="ls">
  811.             <h1 id="title">Index of <span translate="no">{name}</span></h1>
  812.             <p class="read-nav">
  813.                 {isroot_commentout_start}
  814.                     <a href="{my_url}?path={parent_path}{referer_params}">
  815.                         Parent directory
  816.                     </a>
  817.                 {isroot_commentout_end}
  818.                 <a href="{my_url}?sitemap=html">CGIread sitemap</a>
  819.                 <a href="{my_url}">Main page</a>
  820.             </p>
  821.             <table id="ls">
  822.             ''',
  823.             name          =(1, path[rootlen:] + '/'),
  824.             parent_path   =(2, '/'.join(path.split('/')[:-1])[rootlen:]+'/'),
  825.             robots_follow =(2, 'no'*noindex(path)+'follow'),
  826.             navigation    =(0, mk_navigation(
  827.                                 referer,
  828.                                 "Index of "+path[rootlen:]+'/'
  829.                             )),
  830.             referer_params=(2, mk_referer_param(referer)),
  831.             my_url=(0, my_url),
  832.             canonical_url=(0, canonical_url),
  833.             isroot_commentout_start=(0, '<!--'*(path == root)),
  834.             isroot_commentout_end=(0, '-->'*(path == root)),
  835.         ))
  836.     no_access = conf['noaccess'] + conf['hide'] + conf['topsecret']
  837.     
  838.     for x in sorted(os.listdir(path)):
  839.         full_path = os.path.join(path, x)
  840.         
  841.         forbidden = False
  842.         for regex in no_access:
  843.             if re.match(regex, full_path[rootlen:]) is not None:
  844.                 forbidden = True
  845.                 break
  846.         if forbidden:
  847.             continue
  848.         
  849.         #url = cgi.escape(full_path, quote=True)
  850.         try:
  851.             os.listdir(full_path)
  852.             is_dir = 1
  853.         except:
  854.             is_dir = 0
  855.         # mobile_desc
  856.         # desktop_desc
  857.         if is_dir:
  858.            mobile_desc = '<span class="yeah">-&gt;</span>'
  859.            desktop_desc = '<span class="yeah">Directory</span>'
  860.         else:
  861.             try:
  862.                 content = open(full_path).read()        # This fails on Python 3 !!!
  863.                 if sys.version_info[0] == 2:
  864.                     content.decode('UTF-8')
  865.                 binary = False
  866.             except:
  867.                 binary = True
  868.             if binary:
  869.                 desktop_desc = 'Binary'
  870.                 mobile_desc = ':-('
  871.             else:
  872.                 good, title, meta_d, onpage_d = mk_description(full_path)
  873.                 if good == 2:
  874.                     desktop_desc = htmlescape.escape(
  875.                         '<span class="thenumberofthebeast">{}</span>',
  876.                         1, meta_d
  877.                     )
  878.                     if noindex(full_path):
  879.                         mobile_desc = '<span class="yeah">:-)</span>'
  880.                     else:
  881.                         mobile_desc = '<span class="thenumberofthebeast">:-D</span>'
  882.                 elif not noindex(full_path):
  883.                     mobile_desc = '<span class="yeah">:-)</span>'
  884.                     if compressout.debug_cookie:
  885.                         desktop_desc = '<span class="yeah">Text; indexable</span>'
  886.                     else:
  887.                         desktop_desc = '<span class="yeah">Text</span>'
  888.                 else:
  889.                     mobile_desc = ':-|'
  890.                     if compressout.debug_cookie:
  891.                         desktop_desc = 'Boring; unindexable'
  892.                     else:
  893.                         desktop_desc = 'Looks boring'
  894.                     
  895.         compressout.write_b(
  896.             htmlescape.escape(
  897.                 '''<tr><td class="mobile">{mobile_desc}</td>
  898.                 <td><a translate="no"
  899.                     href="{site}?path={path}{referer}">{text}</a></td>
  900.                 <td class="desktop">{desktop_desc}</td></tr>
  901.                 ''',
  902.                 site=(0, my_url),
  903.                 path=(2, full_path[rootlen:] + '/'*is_dir),
  904.                 referer=(2, mk_referer_param(referer)),
  905.                 text=(1, x + '/'*is_dir),
  906.                 mobile_desc=(0, mobile_desc),
  907.                 desktop_desc=(0, desktop_desc),
  908.             )
  909.         )
  910.     compressout.write_b('''            <!--</p>--></table>
  911.         </div></main>
  912.         
  913. <!-- INCLUDED FOOTER -->
  914. <footer><div id="footer">
  915.     <hr class="textonly"/>
  916.     <p>
  917.         Copyright © Oskar Skog<br/>
  918.         Website content released under the <a
  919.         href="https://creativecommons.org/licenses/by/4.0/" rel="license noopener"
  920.         target="_blank">Creative Commons Attribution (CC-BY 4.0)</a> license
  921.         and my software usually under the <span class="a"><a target="_blank"
  922.         rel="noopener"
  923.         href="https://opensource.org/licenses/BSD-2-Clause">FreeBSD license
  924.         (2-clause)</a>.</span>
  925.         <br/>
  926.         Images may be from other sites, I should have cited useful sources
  927.         somewhere on the page.
  928.         <span class="notprint">Contact me if I haven't.</span>
  929.     </p>
  930.     <p id="contact" class="notprint">
  931.         You can contact me at: <a href="mailto:oskar@oskog97.com"
  932.         rel="noopener" target="_blank">oskar@oskog97.com</a>
  933.         <span class="a">(<a href="/pgp-pub/oskar.asc"
  934.                             >PGP public key</a>)</span>
  935.     </p>
  936.     <p> <a class="notprint" href="https://oskog97.com/read/?path=/style.css">
  937.             CSS Stylesheet
  938.         </a>
  939.         <a href="https://validator.w3.org/check/referrer" rel="nofollow noopener"
  940.             target="_blank" class="notprint"><span
  941.             class="img">Valid HTML5</span
  942.         ></a><br/>
  943.     </p>
  944. </div></footer>
  945. <!-- END OF INCLUDED FOOTER -->
  946.     </body>
  947. </html>\n''')
  948. def download(path):
  949.     if noindex(path):
  950.         compressout.write_h('X-Robots-Tag: noindex\n')
  951.     else:
  952.         compressout.write_h('X-Robots-Tag: index\n') # For verbosity.
  953.     try:
  954.         content = open(path).read()
  955.         if sys.version_info[0] == 2:
  956.             content.decode('utf-8')
  957.         compressout.write_h('Content-Type: text/plain; charset=UTF-8\n')
  958.         compressout.write_h(htmlescape.escape(
  959.                 'Link: <{}?path={}>',
  960.                 0, canonical_url,
  961.                 2, path[rootlen:]
  962.             ) + '; rel="canonical"; type="text/html"\n'
  963.         )
  964.     except:
  965.         compressout.write_h(htmlescape.escape(
  966.             'Link: <{}?path={}>; rel="canonical"\n',
  967.             0, canonical_url,
  968.             2, path[rootlen:]
  969.         )) # No type specified.
  970.     if if_none_match(path):
  971.         compressout.write_h('\n')
  972.         if os.getenv('REQUEST_METHOD') != 'HEAD':
  973.             compressout.write_b(content)
  974. def cat(path, referer):
  975.     '''
  976.     '''
  977.     def ol_content(text):
  978.         out_lines = []
  979.         ids = []
  980.         allowed_chars = string.ascii_letters + '_-'
  981.         for index, line in enumerate(text.split('\n')):
  982.             # Create a "permanent" fragment this line.
  983.             this_id = ''
  984.             # Find ids in Python and XHTML
  985.             for decltype in ('def', 'class'):
  986.                 if line.strip().startswith(decltype + ' ') and '(' in line:
  987.                     this_id = line.split(decltype, 1)[1].split('(')[0].strip()
  988.             if 'id="' in line:
  989.                 this_id = line.split('id="')[1].split('"')[0]
  990.             # Prevent bad ids.
  991.             for ch in this_id:
  992.                 if ch not in allowed_chars:
  993.                     this_id = ''
  994.                     break
  995.             if this_id in ids:
  996.                 this_id = ''
  997.             # Create the fragment identifier for the line.
  998.             if this_id:
  999.                 ids.append(this_id)
  1000.                 idline = 'id="content_{}"'.format(this_id)
  1001.             else:
  1002.                 idline = ''
  1003.             # Create line
  1004.             out_lines.append(htmlescape.escape(
  1005.                     '    <li id="{}"><pre translate="no" {}>{}</pre></li>\n',
  1006.                     0, index + 1,
  1007.                     0, idline,
  1008.                     1, line,
  1009.             ))
  1010.         fragment_links = []
  1011.         for fragment in sorted(ids):
  1012.             fragment_links.append(
  1013.                 (
  1014.                     '<a class="quick" href="#content_{0}" translate="no"' +
  1015.                     '>{0}</a>\n'
  1016.                 ).format(
  1017.                     fragment
  1018.                 )
  1019.             )
  1020.         return ''.join(out_lines), ''.join(fragment_links)
  1021.     
  1022.     try:
  1023.         content = open(path).read()
  1024.         if sys.version_info[0] == 2:
  1025.             content.decode('utf-8')
  1026.     except:
  1027.         if noindex(path):
  1028.             compressout.write_h('X-Robots-Tag: noindex\n')
  1029.         else:
  1030.             compressout.write_h('X-Robots-Tag: index\n')
  1031.         compressout.write_h('\n')
  1032.         compressout.write_b(content)
  1033.         return
  1034.     compressout.write_h(html_page)
  1035.     compressout.write_h('\n')
  1036.     if os.getenv('REQUEST_METHOD') == 'HEAD':
  1037.         return
  1038.     
  1039.     ignore, title, meta_description, p_description = mk_description(path)
  1040.     last_modified = time.strftime('%F', time.gmtime(os.stat(path).st_mtime))
  1041.     
  1042.     lines, fragment_links = ol_content(content)
  1043.     if not fragment_links:
  1044.         fragment_links = '(none)'
  1045.     
  1046.     compressout.write_b('''<!DOCTYPE html>
  1047. <html lang="en" xmlns="http://www.w3.org/1999/xhtml">
  1048.     <head>
  1049.         <meta charset="utf-8"/>
  1050.         <meta name="viewport" content="width=device-width, initial-scale=1"/>
  1051.         <link rel="stylesheet" href="https://oskog97.com/style.css" type="text/css"/>
  1052.         <link rel="icon" type="image/png" href="/favicon.png"/>
  1053. <!-- End html5nc macro. -->
  1054. ''')
  1055.     compressout.write_b('''
  1056. <script type="application/ld+json">
  1057. {
  1058.     "@context":
  1059.     {
  1060.         "@vocab": "http://schema.org/"
  1061.     },
  1062.     "@type": "SoftwareSourceCode",
  1063.     "license": "https://opensource.org/licenses/BSD-2-Clause",
  1064.     "author":
  1065.     {
  1066.     ''')
  1067.     compressout.write_b('''
  1068.         "@type": "Person",
  1069.         "@id": "https://oskog97.com/",
  1070.         "name": "{0}",
  1071.         "url": "https://oskog97.com/"
  1072.     '''.format(owner))
  1073.     compressout.write_b('''
  1074.     },
  1075.     "publisher": {"@id": "https://oskog97.com/"},
  1076.     "copyrightHolder": {"@id": "https://oskog97.com/"},
  1077.     ''')
  1078.     compressout.write_b('''
  1079.     "url": "{}#code",
  1080.     "DateModified": "{}"
  1081.     '''.format(
  1082.         canonical_url + '?path=' + path[rootlen:],
  1083.         last_modified,
  1084.     ))
  1085.     compressout.write_b('''
  1086. }
  1087. </script>
  1088.     ''')
  1089.     parent_link = '/'.join(path.split('/')[:-1])[rootlen:]+'/'
  1090.     compressout.write_b(htmlescape.escape('''
  1091.         <link rel="stylesheet" type="text/css" href="/read/style.css"/>
  1092.         <title>{title}</title>
  1093.         <link rel="canonical" href="{canonical}"/>
  1094.         <link
  1095.             rel="alternate"
  1096.             href="{canonical}&amp;download=yes"
  1097.             type="text/plain"
  1098.         />
  1099.         <meta name="robots" content="{noindex_no}index"/>
  1100.         <meta name="description" content="{meta_description}"/>
  1101.     </head>
  1102.     <body>
  1103.         {navigation}
  1104. <main><div id="content">
  1105.     <h1 id="title" translate="no">{title}</h1>
  1106.     <div id="description">
  1107.         {content_description}
  1108.     </div>
  1109.     <table>
  1110.         <tr>
  1111.             <td>Last modified</td>
  1112.             <td><time datetime="{last_modified}">{last_modified}</time></td>
  1113.         </tr>
  1114.         <tr>
  1115.             <td>Lines</td>
  1116.             <td>{linecount}</td>
  1117.         </tr>
  1118.         {begin_debug}<tr>
  1119.             <td>Indexable</td>
  1120.             <td>{indexable}</td>
  1121.         </tr>{end_debug}
  1122.     </table>
  1123.     <p class="notprint read-nav">
  1124.         <a href="{my_url}?path={parent_dir}">Parent directory</a>
  1125.         <a href="{my_url}?path={path}&amp;download=yes" target="_blank">Download</a>
  1126.         <a href="{my_url}?sitemap=html">CGIread sitemap</a>
  1127.         <a href="{my_url}">Main page</a>
  1128.     </p>
  1129.     <p class="notprint">
  1130.         Quick links:\n{fragments}
  1131.     </p>
  1132. <ol id="code">
  1133. {content}
  1134. </ol>
  1135. </div></main>
  1136. ''',
  1137.         title=(2, title),
  1138.         content=(0, lines),
  1139.         parent_dir=(2, parent_link + mk_referer_param(referer)),
  1140.         navigation=(0, mk_navigation(referer, path[rootlen:])),
  1141.         canonical=(2, canonical_url + '?path=' + path[rootlen:]),
  1142.         path=(2, path[rootlen:]),
  1143.         noindex_no=(2, 'no' * noindex(path)),
  1144.         meta_description=(2, meta_description),
  1145.         content_description=(0, p_description),
  1146.         last_modified=(2, last_modified),
  1147.         linecount=(1, content.count('\n') + 1),
  1148.         indexable=(0, {True: 'No', False: 'Yes'}[noindex(path)]),
  1149.         fragments=(0, fragment_links),
  1150.         my_url=(0, my_url),
  1151.         begin_debug=(0,['<!--',''][compressout.debug_cookie]),
  1152.         end_debug=(0,['-->',''][compressout.debug_cookie]),
  1153.     ))
  1154.     compressout.write_b('''
  1155.         
  1156. <!-- INCLUDED FOOTER -->
  1157. <footer><div id="footer">
  1158.     <hr class="textonly"/>
  1159.     <p>
  1160.         Copyright © Oskar Skog<br/>
  1161.         Website content released under the <a
  1162.         href="https://creativecommons.org/licenses/by/4.0/" rel="license noopener"
  1163.         target="_blank">Creative Commons Attribution (CC-BY 4.0)</a> license
  1164.         and my software usually under the <span class="a"><a target="_blank"
  1165.         rel="noopener"
  1166.         href="https://opensource.org/licenses/BSD-2-Clause">FreeBSD license
  1167.         (2-clause)</a>.</span>
  1168.         <br/>
  1169.         Images may be from other sites, I should have cited useful sources
  1170.         somewhere on the page.
  1171.         <span class="notprint">Contact me if I haven't.</span>
  1172.     </p>
  1173.     <p id="contact" class="notprint">
  1174.         You can contact me at: <a href="mailto:oskar@oskog97.com"
  1175.         rel="noopener" target="_blank">oskar@oskog97.com</a>
  1176.         <span class="a">(<a href="/pgp-pub/oskar.asc"
  1177.                             >PGP public key</a>)</span>
  1178.     </p>
  1179.     <p> <a class="notprint" href="https://oskog97.com/read/?path=/style.css">
  1180.             CSS Stylesheet
  1181.         </a>
  1182.         <a href="https://validator.w3.org/check/referrer" rel="nofollow noopener"
  1183.             target="_blank" class="notprint"><span
  1184.             class="img">Valid HTML5</span
  1185.         ></a><br/>
  1186.     </p>
  1187. </div></footer>
  1188. <!-- END OF INCLUDED FOOTER -->
  1189.     </body>
  1190. </html>
  1191. ''')
  1192. def if_none_match(path):
  1193.     '''
  1194.     ETag handling for `cat`, `ls` and `download`:
  1195.     
  1196.     
  1197.     Returns `True` if content needs to be generated.
  1198.     Outputs necessary headers and 304 statuses.
  1199.     '''
  1200.     try:
  1201.         meta_time = os.stat(path + '.info').st_mtime
  1202.     except:
  1203.         meta_time = 0
  1204.     if sys.version_info[0] > 2:
  1205.         query_string = os.getenv('QUERY_STRING', '').encode('utf-8')
  1206.     else:
  1207.         query_string = os.getenv('QUERY_STRING', '')
  1208.     ETag = '"{}{}-{}({})-{}-({}-{})"'.format(
  1209.         'x'*('application/xhtml+xml' in html_page),
  1210.         'z'*('gzip' in os.getenv('HTTP_ACCEPT_ENCODING', '')),
  1211.         os.stat(path).st_mtime,
  1212.         meta_time,
  1213.         base64.b64encode(query_string),
  1214.         os.stat('index.py').st_mtime,
  1215.         os.stat('read.cfg').st_mtime,
  1216.     )
  1217.     compressout.write_h('Vary: If-None-Match\n')
  1218.     compressout.write_h('ETag: {}\n'.format(ETag))
  1219.     compressout.write_h(
  1220. '''X-ETag-Synopsis: [x][z]-<f_time>(<m_time>)-<query>-(<s_time>-<c_time>)
  1221. X-ETag-Description-x: "Client accepts application/xhtml+xml"
  1222. X-ETag-Description-z: "Content-Encoding: gzip"
  1223. X-ETag-Description-f_time: "Unix last modified time for the requested file"
  1224. X-ETag-Description-m_time: "Unix last modified time for the file's metadata"
  1225. X-ETag-Description-query: "base64 encoded $QUERY_STRING"
  1226. X-ETag-Description-s_time: "Unix last modified time for '/read/index.py'"
  1227. X-ETag-Description-c_time: "Unix last modified time for '/read/read.cfg'"
  1228. ''')
  1229.     if os.getenv('HTTP_IF_NONE_MATCH', '') == ETag:
  1230.         compressout.write_h('Status: 304\n\n')
  1231.         return False
  1232.     else:
  1233.         return True
  1234. def is_injection_attempt(path_param, referer_URI, referer_title):
  1235.     '''
  1236.     Various checks to see if any form of injection attempt has been
  1237.     made.  This function checks the `path`, `referer` and `title`
  1238.     parameters.
  1239.     
  1240.     Returns True if the request is an injection attempt.
  1241.     
  1242.     - XSS
  1243.     - URL injection
  1244.     - Spam injection
  1245.     - Restricted files access
  1246.     '''
  1247.     # If the path parameter contains an XSS attempt, it can't be corrected
  1248.     evil = False
  1249.     # Prevent attacks.
  1250.     if '..' in path_param:
  1251.         return True
  1252.     for var in referer_URI, referer_title:
  1253.         for ch in var:
  1254.             if ord(ch) < 32:
  1255.                 return True
  1256.             if ch in '<>&\'"':
  1257.                 return True
  1258.             # NOTICE: The following will limit parameters to ASCII.
  1259.             if ord(ch) > 126:
  1260.                 return True
  1261.     # Prevent linking to Mallory.
  1262.     for start in ('http://', 'https://', '//', 'ftp://'):
  1263.         if referer_URI.startswith(start):
  1264.             hostname = referer_URI.split('//')[1].split('/')[0]
  1265.             if hostname not in conf['allowed-referer-hosts']:
  1266.                 return True
  1267.             else:
  1268.                 break
  1269.     else:
  1270.         if ':' in referer_URI:
  1271.             return True
  1272.     # Prevent injected spam
  1273.     if spammy.spammy(referer_title) or len(referer_title) > 42:
  1274.         return True
  1275.     # No match.
  1276.     return False
  1277. def handle_injection_attempt(path_param, referer_URI, referer_title):
  1278.     '''
  1279.     Decide if the injection attempt was due to innocently following
  1280.     a malicious link or due to creating one.
  1281.     '''
  1282.     # Check if the URL can be sanitized.
  1283.     if is_injection_attempt(path_param, '', ''):
  1284.         destination = 'https://en.wikipedia.org/wiki/Data_validation'
  1285.     else:
  1286.         destination = my_url + '?path=' + path_param
  1287.     redirect_spam(destination)
  1288. def main():
  1289.     '''
  1290.     `compressout.init` MUST be called before `main`
  1291.     and `compressout.done` after.
  1292.     '''
  1293.     # HTML vs XHTML
  1294.     global html_page
  1295.     html_page = 'Vary: Accept\n'
  1296.     if 'application/xhtml+xml' in os.getenv('HTTP_ACCEPT', ''):
  1297.         html_page += 'Content-Type: application/xhtml+xml; charset=UTF-8\n'
  1298.     else:
  1299.         html_page += 'Content-Type: text/html; charset=UTF-8\n'
  1300.     # Check that the method is either GET, HEAD or OPTIONS.
  1301.     if os.getenv('REQUEST_METHOD') not in ('GET', 'HEAD'):
  1302.         if os.getenv('REQUEST_METHOD') != 'OPTIONS':
  1303.             compressout.write_h('Status: 405\n')
  1304.         compressout.write_h('Allow: GET, HEAD, OPTIONS\n')
  1305.         compressout.write_h('Content-Type: text/plain\n')
  1306.         compressout.write_h('\n')
  1307.         if os.getenv('REQUEST_METHOD') != 'OPTIONS':
  1308.             compressout.write_b('Method not allowed!\n')
  1309.         compressout.write_b('Allowed methods: GET, HEAD, OPTIONS\n')
  1310.         return
  1311.     # Get the parameters.
  1312.     params = cgi.FieldStorage()
  1313.     path = path_param = params.getfirst('path', default='')
  1314.     referer_URI = params.getfirst('referer', default='')
  1315.     referer_title = params.getfirst('title', default='Back')
  1316.     referer = (referer_URI, referer_title)
  1317.     download_flag = params.getfirst('download', default='no')
  1318.     sitemap_param = params.getfirst('sitemap', default='none')
  1319.     
  1320.     if not os.getenv('QUERY_STRING'):
  1321.         index_page()
  1322.         return
  1323.         
  1324.     # Bad request, but will match the evil patterns.
  1325.     # Keep it before the evil stopper.
  1326.     if bool(path_param) and not path_param.startswith('/'):
  1327.         status400('`path` is not relative to this site. (No leading slash.)')
  1328.         return
  1329.     
  1330.     # Do not allow evil requests.
  1331.     allow = True
  1332.     # Keep things within the server root.
  1333.     try:
  1334.         path = os.path.realpath(root + path)
  1335.     except:
  1336.         allow = False
  1337.     if path != root and not path.startswith(root + '/'):
  1338.         allow = False
  1339.     # Stop at forbidden paths. #1/2
  1340.     for regex in conf['noaccess']:
  1341.         if re.match(regex, path[rootlen:]) is not None:
  1342.             allow = False
  1343.     
  1344.     # Prevent XSS, URL injection, spam injection and miscellaneous assholery.
  1345.     if is_injection_attempt(path_param, referer_URI, referer_title):
  1346.         allow = False
  1347.     if not allow:
  1348.         handle_injection_attempt(path_param, referer_URI, referer_title)
  1349.         return
  1350.     
  1351.     # Bad requests:
  1352.     if download_flag not in ('yes', 'no'):
  1353.         status400('`download` MUST be "yes", "no" or unset.')
  1354.         return
  1355.     if bool(path_param) and sitemap_param != 'none':
  1356.         status400('The `sitemap` parameter cannot be used with any other.')
  1357.         return
  1358.     if download_flag == 'yes' and bool(referer_URI):
  1359.         status400("`download=yes` can't be used with the `referer` parameter.")
  1360.         return
  1361.     if sitemap_param not in ('none', 'xml', 'html'):
  1362.         status400('`sitemap` MUST be "html", "xml" or unset.')
  1363.         return
  1364.     if download_flag == 'yes' and not bool(path_param):
  1365.         status400('Nothing to `download`. Use the `path` parameter.')
  1366.         return
  1367.     if bool(referer_URI) and not bool(path_param):
  1368.         status400('`referer` cannot be used without `path`')
  1369.         return
  1370.     if referer_title != 'Back' and not bool(referer_URI):
  1371.         status400('`referer` is not set.')
  1372.         return
  1373.     
  1374.     if allow:
  1375.     # Generate sitemap?
  1376.         if sitemap_param != 'none':
  1377.             sitemap(sitemap_param)
  1378.         else:
  1379.             # Stop at forbidden paths. #2/2
  1380.             for regex in conf['topsecret']:
  1381.                 if re.match(regex, path[rootlen:]) is not None:
  1382.                     status404()
  1383.                     break
  1384.             else:
  1385.                 # Allowed to be seen.
  1386.                 try:
  1387.                     os.listdir(path)
  1388.                     if download_flag == 'no':
  1389.                         if if_none_match(path):
  1390.                             ls(path, referer)
  1391.                     else:
  1392.                         status400("Can't download a directory.")
  1393.                 except OSError as e:
  1394.                     if e.errno == errno.ENOTDIR:
  1395.                         if download_flag == 'no':
  1396.                             if if_none_match(path):
  1397.                                 cat(path, referer)
  1398.                         else:
  1399.                             # `download` sets a few headers.
  1400.                             download(path)
  1401.                     elif e.errno == errno.ENOENT:
  1402.                         status404()
  1403.                     else:
  1404.                         raise ValueError(
  1405.                             'errno must be either ENOTDIR or ENOENT'
  1406.                         )
  1407. if __name__ == '__main__':  # Only when executed as a script (the CGI entry point), not on import.
  1408.     compressout.init()  # Presumably sets up the compressout output layer before anything is written -- confirm in compressout.
  1409.     main()  # Serve the request; main() emits headers/body via compressout.write_h()/write_b().
  1410.     compressout.done()  # Presumably finalizes/flushes the output -- confirm in compressout. Not reached if main() raises.