Disavow file compressor
'shrink-disavow' minifies disavow files for Google Search Console / Webmaster Tools. It converts multiple complete URLs on the same domain to the domain: syntax.
Last modified | |
Lines | 102 |
Parent directory Download CGIread sitemap Main page
Quick links: main
#!/usr/bin/python
# Copyright (c) Oskar Skog, 2015
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# This software is provided by the copyright holders and contributors "as is"
# and any express or implied warranties, including, but not limited to, the
# implied warranties of merchantability and fitness for a particular purpose
# are disclaimed. In no event shall the copyright holder or contributors be
# liable for any direct, indirect, incidental, special, exemplary, or
# consequential damages (including, but not limited to, procurement of
# substitute goods or services; loss of use, data, or profits; or business
# interruption) however caused and on any theory of liability, whether in
# contract, strict liability, or tort (including negligence or otherwise)
# arising in any way out of the use of this software, even if advised of the
# possibility of such damage.
'''
'shrink-disavow' minifies disavow files for Google Search Console / Webmaster Tools.
It converts multiple complete URLs on the same domain to the domain: syntax.
'''
def main(infile, outfile, numlinks):
    '''
    Minify a disavow file for Google Search Console / Webmaster Tools.

    `infile` and `outfile` are filenames.
    `numlinks` is the number of URLs from the same domain that are
    required for replacing all those URLs with a single 'domain:' line.

    Everything from a '#' to the end of a line is treated as a comment
    and dropped; blank lines are skipped.  Lines of the form
    'domain: example.com' are kept as domains, every other line is
    treated as a URL.  URLs whose domain is disavowed as a whole (either
    listed in `infile` or promoted because of `numlinks`) are left out.

    `outfile` starts with a comment naming `infile`, followed by the
    remaining URLs and the 'domain: ...' lines, sorted and without
    duplicates.
    '''
    urls = []           # (domain, complete URL) pairs, in input order.
    domains = set()     # Domains that are disavowed as a whole.

    # Find URLs and domains.
    with open(infile) as source:
        for line in source:
            content = line.split('#')[0].strip()
            if not content:
                continue
            key, colon, value = content.partition(':')
            if colon and key.strip() == 'domain':
                domains.add(value.strip())
            else:
                # The domain is whatever sits between the scheme and the
                # first '/'.  A line without '://' is kept and its first
                # path component is used, instead of crashing.
                parts = content.split('://')
                rest = parts[1] if len(parts) > 1 else parts[0]
                urls.append((rest.split('/')[0], content))

    # Find domains that appear several (`numlinks`) times in the *URLs*.
    # Duplicate URL lines each count, one pass instead of one per URL.
    counts = {}
    for domain, _ in urls:
        counts[domain] = counts.get(domain, 0) + 1
    domains.update(
        domain for domain, count in counts.items() if count >= numlinks)

    # Generate output lines, eliminating URLs that would be redundant as
    # they are disavowed with the domain syntax.  Using a set also makes
    # uniq(1) unnecessary.
    outlines = set(url for domain, url in urls if domain not in domains)
    outlines.update('domain: {}'.format(domain) for domain in domains)

    with open(outfile, 'w') as target:
        target.write(
            '# This file is autogenerated from "{}".\n'.format(infile))
        # Sort them so that manual examination of the output will be easier.
        for line in sorted(outlines):
            target.write(line + '\n')
if __name__ == '__main__':
    import sys

    # Command line: script_name infile outfile [num_of_links]
    numlinks = 2        # Default when num_of_links is not given.
    bad_usage = len(sys.argv) not in (3, 4)
    if len(sys.argv) == 4:
        try:
            numlinks = int(sys.argv[3])
        except ValueError:
            # A non-numeric num_of_links is a usage error, not a traceback.
            bad_usage = True
    if bad_usage:
        sys.stderr.write('''This script minifies disavow files for
Google Search Console / Webmaster Tools
Usage: script_name {infile} {outfile} [num_of_links]
num_of_links is the number of URLs from the same domain that will are
required for replacing all those URLs with a single 'domain:'.
The default value is two.
''')
        # Report the failure to the caller; 2 is the conventional exit
        # status for command line usage errors.
        sys.exit(2)
    main(sys.argv[1], sys.argv[2], numlinks)