# Source code for unique

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'''
Unique: Ensure all IP addresses are unique in the output of Resolution.

.. moduleauthor:: Damiano Boppart <hat.guy.repo@gmail.com>

Copyright 2014 Damiano Boppart

This file is part of ECN-Spider.
'''

import sys
import argparse
#from publicsuffix import PublicSuffixList
#from urllib.parse import urlparse
import pandas as pd
#import numpy as np
#import scipy as sp
import datetime


def arguments(argv):
    '''
    Parse the command-line arguments.

    Two positional arguments are accepted: ``input_file`` and ``output_file``.

    :param argv: The command line, as a list of argument strings.
    :returns: The return value of ``argparse.ArgumentParser.parse_args``.
    '''
    parser = argparse.ArgumentParser(
        description='Ensure all IP addresses are unique in the output of Resolution.',
        epilog='This program is part of ECN-Spider.')
    parser.add_argument('input_file', type=str, help='Input domain list.')
    parser.add_argument('output_file', type=str, help='Output file.')
    return parser.parse_args(argv)
def get_input(file_):
    '''
    Read input CSV file and return it as a DataFrame.

    The file is read without a header row; its columns are named ``rank``,
    ``domain``, ``ipv4`` and ``ipv6``. Empty fields are read as missing
    values. The result is sorted by ``rank``.

    :param file_: The filename.
    :returns: The DataFrame.
    '''
    column_names = 'rank,domain,ipv4,ipv6'.split(',')
    column_types = {'rank': 'int32', 'domain': 'object', 'ipv4': 'object', 'ipv6': 'object'}
    d = pd.read_csv(file_, names=column_names, dtype=column_types, na_values=[''])
    # DataFrame.sort(columns=...) no longer exists in pandas; sort_values is
    # the replacement.
    d = d.sort_values(by='rank')
    return d
def unique_col(df, col_name):
    '''
    Remove duplicate records based on the values of only one column of the DataFrame.

    Records whose value in ``col_name`` is missing are all kept; only records
    with a value present are de-duplicated. The input DataFrame is not
    modified.

    :param df: The DataFrame. It must have a ``rank`` column, which is used to sort the result.
    :param col_name: The column name.
    :returns: A new DataFrame without the duplicates, sorted by ``rank``.
    '''
    missing = df[df[col_name].isnull()]
    deduplicated = df[df[col_name].notnull()].drop_duplicates(col_name)
    result = pd.concat([missing, deduplicated])
    # DataFrame.sort(columns=...) no longer exists in pandas; sort_values is
    # the replacement.
    result = result.sort_values(by='rank')
    return result
def main(argv):
    '''
    Method to be called when run from the command line.

    Reads the input file, drops records without any IP address, removes
    records with duplicate IPv4 and then duplicate IPv6 addresses, writes the
    result to the output file as header-less CSV and prints a summary.

    :param argv: The command line, as a list of argument strings.
    :returns: ``0``.
    '''
    t0 = datetime.datetime.now()  # Timestamp for runtime estimation

    args = arguments(argv)

    d = get_input(args.input_file)
    l_0 = len(d)

    # Keep only records that have at least one of the two address types.
    d = d[d.ipv4.notnull() | d.ipv6.notnull()]
    l_w_ip = len(d)

    d = unique_col(d, 'ipv4')
    l_4 = len(d)

    d = unique_col(d, 'ipv6')
    l_46 = len(d)

    d.to_csv(args.output_file, columns='rank,domain,ipv4,ipv6'.split(','), header=False, index=False)

    t1 = datetime.datetime.now()

    def percent(count):
        # Share of the original record count; avoids a ZeroDivisionError
        # should the input frame be empty.
        return count / l_0 * 100 if l_0 else 0.0

    MSG = '''Completed. Total runtime: {}. Original length: {}. After removing noIP entries: {} ({:.2f}%). After removing IPv4 dupes: {} ({:.2f}%). After removing IPv6 dupes: {} ({:.2f}%).'''

    print(MSG.format(
        t1 - t0,
        l_0,
        l_w_ip, percent(l_w_ip),
        l_4, percent(l_4),
        l_46, percent(l_46)))

    return 0
# Script entry point: hand the arguments (minus the program name) to main()
# and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)