Disavow file compressor

'shrink-disavow' minifies disavow files for Google Search Console / Webmaster Tools. It converts multiple complete URLs on the same domain to the domain: syntax.

Last modified
Lines 102

Parent directory Download CGIread sitemap Main page

Quick links: main

  1. #!/usr/bin/python
  2. # Copyright (c) Oskar Skog, 2015
  3. # 
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are met:
  6. # 
  7. # 1.  Redistributions of source code must retain the above copyright notice,
  8. #     this list of conditions and the following disclaimer.
  9. # 
  10. # 2.  Redistributions in binary form must reproduce the above copyright notice,
  11. #     this list of conditions and the following disclaimer in the documentation
  12. #     and/or other materials provided with the distribution.
  13. # 
  14. # This software is provided by the copyright holders and contributors "as is"
  15. # and any express or implied warranties, including, but not limited to, the
  16. # implied warranties of merchantability and fitness for a particular purpose
  17. # are disclaimed. In no event shall the copyright holder or contributors be
  18. # liable for any direct, indirect, incidental, special, exemplary, or
  19. # consequential damages (including, but not limited to, procurement of
  20. # substitute goods or services; loss of use, data, or profits; or business
  21. # interruption) however caused and on any theory of liability, whether in
  22. # contract, strict liability, or tort (including negligence or otherwise)
  23. # arising in any way out of the use of this software, even if advised of the
  24. # possibility of such damage.
  25. '''
  26. 'shrink-disavow' minifies disavow files for Google Search Console / Webmaster Tools.
  27. It converts multiple complete URLs on the same domain to the domain: syntax.
  28. '''
  29. def main(infile, outfile, numlinks):
  30.     '''
  31. This script minifies disavow files for
  32. Google Search Console / Webmaster Tools
  33. `num_of_links` is the number of URLs from the same domain that will are
  34. required for replacing all those URLs with a single 'domain:'.
  35. `infile` and `outfile` are filenames.
  36.     '''
  37.     
  38.     URLs = []
  39.     domains = []
  40.     
  41.     # Find URLs and domains.
  42.     for line in open(infile).read().split('\n'):
  43.         content = line.split('#')[0].strip()
  44.         if content:
  45.             if content.startswith('domain'):
  46.                 domains.append(content.split(':', 1)[1].strip())
  47.             else:
  48.                 domain = content.split('://')[1].split('/')[0]
  49.                 URLs.append((domain, content))
  50.     
  51.     # Find domains that appear several (`numlinks`) times in the *URLs*.
  52.     for domain, URL in URLs:
  53.         if domain not in domains:
  54.             if len(filter(lambda x: x[0] == domain, URLs)) >= numlinks:
  55.                 domains.append(domain)
  56.     
  57.     # Eliminate URLs that would be redundant as
  58.     # they are disavowed with the domain syntax.
  59.     for domain in domains:
  60.         URLs = filter(lambda x: x[0] != domain, URLs)
  61.     
  62.     # Generate output lines.
  63.     outlines = list(map(lambda x: x[1], URLs))
  64.     outlines += list(map(lambda x: 'domain: {}'.format(x), domains))
  65.     # Sort them so that manual examination of the output will be easier.
  66.     outlines.sort()
  67.     
  68.     f = open(outfile, 'w')
  69.     f.write('# This file is autogenerated from "{}".\n'.format(infile))
  70.     
  71.     oldline = None
  72.     for line in outlines:
  73.         # No need for uniq(1).
  74.         if line != oldline:
  75.             f.write(line + '\n')
  76.             oldline = line
  77. if __name__ == '__main__':
  78.     import sys
  79.     if len(sys.argv) == 3:
  80.         main(sys.argv[1], sys.argv[2], 2)
  81.     elif len(sys.argv) == 4:
  82.         main(sys.argv[1], sys.argv[2], int(sys.argv[3]))
  83.     else:
  84.         sys.stderr.write('''This script minifies disavow files for
  85. Google Search Console / Webmaster Tools
  86. Usage: script_name {infile} {outfile} [num_of_links]
  87. num_of_links is the number of URLs from the same domain that will are
  88. required for replacing all those URLs with a single 'domain:'.
  89. The default value is two.
  90. ''')