#!/usr/bin/python
# Copyright (c) Oskar Skog, 2015
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1.  Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#
# 2.  Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#
# This software is provided by the copyright holders and contributors "as is"
# and any express or implied warranties, including, but not limited to, the
# implied warranties of merchantability and fitness for a particular purpose
# are disclaimed. In no event shall the copyright holder or contributors be
# liable for any direct, indirect, incidental, special, exemplary, or
# consequential damages (including, but not limited to, procurement of
# substitute goods or services; loss of use, data, or profits; or business
# interruption) however caused and on any theory of liability, whether in
# contract, strict liability, or tort (including negligence or otherwise)
# arising in any way out of the use of this software, even if advised of the
# possibility of such damage.

'''
'shrink-disavow' minifies disavow files for Google Search Console /
Webmaster Tools.

It converts multiple complete URLs on the same domain to the domain: syntax.
'''

import collections
import sys


def _shrink(lines, numlinks):
    '''
    Return the sorted, de-duplicated output lines for the disavow `lines`.

    `lines` is any iterable of text lines (trailing newlines are allowed).
    Everything after a '#' on a line is treated as a comment and dropped.

    A host that occurs in at least `numlinks` URL lines is promoted to a
    'domain:' entry, and every URL on a disavowed domain is removed.
    Lines that are neither a 'domain:' entry nor contain '://' are copied
    through unchanged, as no host can be extracted from them.
    '''
    domains = set()
    # (host, text) pairs; host is None when the line has no '://'.
    entries = []

    for line in lines:
        content = line.split('#')[0].strip()
        if not content:
            continue
        key, colon, value = content.partition(':')
        # Require exactly 'domain' before the colon so that ordinary lines
        # that merely begin with those letters are not misread.
        if colon and key.strip() == 'domain':
            domains.add(value.strip())
            continue
        pieces = content.split('://')
        host = pieces[1].split('/')[0] if len(pieces) > 1 else None
        entries.append((host, content))

    # Count every host once instead of rescanning the list for each URL.
    tally = collections.Counter(
        host for host, _ in entries if host is not None)
    for host, count in tally.items():
        if count >= numlinks:
            domains.add(host)

    # URLs covered by a 'domain:' entry are redundant.
    outlines = set(text for host, text in entries if host not in domains)
    outlines.update('domain: {}'.format(name) for name in domains)

    # Sorted so that manual examination of the output is easier.
    return sorted(outlines)


def main(infile, outfile, numlinks):
    '''
    Minify a disavow file for Google Search Console / Webmaster Tools.

    `infile` and `outfile` are filenames.  They may name the same file,
    because the input is read completely and closed before the output is
    opened for writing.

    `numlinks` is the number of URLs from the same domain that are
    required for replacing all those URLs with a single 'domain:' entry.

    The output starts with a comment naming `infile`, followed by the
    remaining entries, sorted and without duplicates.
    '''
    with open(infile) as source:
        outlines = _shrink(source, numlinks)

    with open(outfile, 'w') as sink:
        sink.write('# This file is autogenerated from "{}".\n'.format(infile))
        sink.writelines(line + '\n' for line in outlines)


if __name__ == '__main__':
    if len(sys.argv) == 3:
        main(sys.argv[1], sys.argv[2], 2)
    elif len(sys.argv) == 4:
        main(sys.argv[1], sys.argv[2], int(sys.argv[3]))
    else:
        sys.stderr.write('''This script minifies disavow files for Google Search Console / Webmaster Tools

Usage: script_name {infile} {outfile} [num_of_links]

num_of_links is the number of URLs from the same domain that will are required
for replacing all those URLs with a single 'domain:'.
The default value is two.
''')