#Simple script to dereplicate a set of fasta sequences. #Just looks for identity on the argv[1] file and write dereplicated #and sorted by abundance sequences back to argv[2] __author__='Hans-Joachim Ruscheweyh' import Bio.SeqIO.FastaIO as FastaIO import collections import sys import textwrap import gzip seq2count = collections.defaultdict(list) pos = 0 sys.stderr.write('Starting...\n') with open('../input/OMDv2.0_AA_GENO_noX.faa', 'r') as handle: for (header, sequence) in FastaIO.SimpleFastaParser(handle): seq2count[sequence].append(header.split()[0]) pos += 1 if pos % 1000000 == 0: sys.stderr.write('\r ') sys.stderr.write('\rRead {:,}\t{:,}'.format(pos, len(seq2count))) sys.stderr.write('\r ') sys.stderr.write('\rRead {:,}'.format(pos)) sys.stderr.write('\n') pos = 1 outfile = 'OMDBv2.0_AA_G_NR100.faa' outfile2 = 'OMDBv2.0_AA_G_NR100.cluster.tsv' prefix = 'OMDBv2.0_AA_G_NR100' with open(outfile, 'w') as handle, open(outfile2, 'w') as handle2: handle2.write('CLUSTER\tLENGTH\tSIZE\tREPRESENTATIVE\tMEMBERS\n') for seq, members in seq2count.items(): pp = str(pos).zfill(12) members = sorted(members) rep = members[0] ptmp = ';'.join(members) handle.write(f'>{prefix}_{pp}\trep={rep};size={len(members)}\n{seq}\n') handle2.write(f'{prefix}_{pp}\t{len(seq)}\t{len(members)}\t{rep}\t{ptmp}\n') pos += 1 if pos % 10000 == 0: sys.stderr.write('\r ') sys.stderr.write('\rWrote {:,}\t{:,}'.format(pos, len(seq2count))) sys.stderr.write('\r ') sys.stderr.write('\rWrote {:,}'.format(pos)) sys.stderr.write('\nFinished...\n')