采用 kofam 的注释结果,结合 KEGG 官网的 htext 文件(.keg)进行通路注释
用法:
python kofam2kegg.py kofam.out ath00001.keg my_kegg_output
code:
import sys
from collections import defaultdict
def parse_kofam_file(kofam_file):
    """Group gene IDs by KO from a two-column, tab-separated table.

    Args:
        kofam_file: Path to a text file whose useful lines look like
            ``gene<TAB>KO``.

    Returns:
        A ``defaultdict(list)`` mapping each KO to the genes listed with it,
        in file order. Any line that does not split into exactly two
        tab-separated fields (after surrounding whitespace is stripped)
        is ignored.
    """
    genes_by_ko = defaultdict(list)
    with open(kofam_file) as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            if len(fields) != 2:
                # Not a gene/KO pair (blank line, gene without a KO, extra columns).
                continue
            gene_id, ko_id = fields
            genes_by_ko[ko_id].append(gene_id)
    return genes_by_ko
def parse_keg_file(keg_file):
    """Parse a KEGG htext (.keg) hierarchy file.

    Lines are dispatched on their first character after stripping:

    * ``A...`` sets the current level-1 category.
    * ``B...`` sets the current level-2 category.
    * ``C  00010 Name [PATH:ath00010]`` starts a pathway.
    * ``D  gene description<TAB>K00844 ...`` assigns the KO (first token
      after the tab) to the current pathway. ``D`` lines without exactly
      one tab are ignored.

    Args:
        keg_file: Path to the .keg file.

    Returns:
        A tuple ``(ko_to_pathway, pathway_info)``:

        * ``ko_to_pathway`` -- ``defaultdict(list)`` mapping a KO to the
          pathway IDs it occurs in, each listed once, in first-seen order.
        * ``pathway_info`` -- dict mapping a pathway ID to a dict with the
          keys ``'Pathway'`` (code plus name), ``'Level1'`` and ``'Level2'``.

        Pathway IDs are the bare numeric code (e.g. ``'00010'``), so a
        caller can prepend a prefix such as ``ko`` without picking up the
        organism code or the closing bracket of the ``[PATH:...]`` tag.
    """
    import re  # function-scope import keeps this block self-contained

    # Trailing "[PATH:ath00010]" / "[BR:ko01000]" token; group 1 is the digits.
    tag_pattern = re.compile(r'\[[A-Z]+:[A-Za-z]*(\d+)\]')

    ko_to_pathway = defaultdict(list)
    pathway_info = {}
    level1 = level2 = ''
    pathway_id = ''
    with open(keg_file) as f:
        for line in f:
            line = line.strip()
            if line.startswith('A'):
                level1 = line[1:].strip()
            elif line.startswith('B'):
                level2 = line[1:].strip()
            elif line.startswith('C'):
                tokens = line[1:].split()
                if not tokens:
                    # Bare "C" line carries no pathway; nothing to record.
                    continue
                tag = tag_pattern.fullmatch(tokens[-1])
                if tag:
                    name_tokens = tokens[:-1]
                    pathway_id = tag.group(1)
                else:
                    # No bracketed tag: keep the whole name and fall back
                    # to the leading code as the ID.
                    name_tokens = tokens
                    pathway_id = tokens[0]
                pathway_info[pathway_id] = {
                    'Pathway': ' '.join(name_tokens),
                    'Level1': level1,
                    'Level2': level2
                }
            elif line.startswith('D'):
                parts = line.split('\t')
                # Skip D lines that carry no KO column or precede any pathway.
                if len(parts) != 2 or not pathway_id:
                    continue
                ko = parts[1].split()[0]
                # Many genes share a KO within one pathway; record it once.
                if pathway_id not in ko_to_pathway[ko]:
                    ko_to_pathway[ko].append(pathway_id)
    return ko_to_pathway, pathway_info
def main(kofam_file, keg_file, output_file):
    """Write a per-pathway summary table of annotated genes and KOs.

    Genes grouped by KO (from ``parse_kofam_file``) are joined with the
    KO-to-pathway mapping (from ``parse_keg_file``). KOs that have no
    pathway entry are skipped.

    Args:
        kofam_file: Path passed to ``parse_kofam_file``.
        keg_file: Path passed to ``parse_keg_file``.
        output_file: Path of the tab-separated table to write. One row per
            pathway: name, number of distinct genes, the pathway ID
            prefixed with ``ko``, level 1, level 2, and the sorted,
            ``;``-joined KOs and genes.
    """
    genes_by_ko = parse_kofam_file(kofam_file)
    pathways_by_ko, pathway_meta = parse_keg_file(keg_file)

    # pathway ID -> (gene set, KO set), kept in first-seen order.
    collected = {}
    for ko, genes in genes_by_ko.items():
        if ko not in pathways_by_ko:
            continue
        for pathway_id in pathways_by_ko[ko]:
            gene_set, ko_set = collected.setdefault(pathway_id, (set(), set()))
            gene_set.update(genes)
            ko_set.add(ko)

    header = "Pathway\tGeneCount\tPathway ID\tLevel 1\tLevel 2\tKOs\tGenes\n"
    with open(output_file, 'w') as out:
        out.write(header)
        for pathway_id, (gene_set, ko_set) in collected.items():
            # Pathways missing from the metadata still get a row with blanks.
            meta = pathway_meta.get(pathway_id, {})
            name = meta.get('Pathway', '')
            top_level = meta.get('Level1', '')
            sub_level = meta.get('Level2', '')
            ko_field = ';'.join(sorted(ko_set))
            gene_field = ';'.join(sorted(gene_set))
            out.write(
                f"{name}\t{len(gene_set)}\tko{pathway_id}\t"
                f"{top_level}\t{sub_level}\t"
                f"{ko_field}\t{gene_field}\n"
            )
if __name__ == "__main__":
    # Expect exactly three arguments: kofam table, .keg file, output path.
    if len(sys.argv) != 4:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # so shell pipelines can detect misuse. The usage line names the
        # script as it was actually invoked instead of a placeholder name.
        sys.exit(f"用法: python {sys.argv[0]} kofam.out keg_file output_file")
    main(sys.argv[1], sys.argv[2], sys.argv[3])