The Microbiome paper "Challenges in capturing the mycobiome from shotgun metagenome data: lack of software and databases" found EukDetect to be fairly accurate.
Installation
https://github.com/allind/EukDetect
cd Software
git clone https://github.com/allind/EukDetect.git
cd EukDetect
# Download the database: https://figshare.com/articles/dataset/Eukdetect_database/12670856/8?file=34880610
tar -xzvf eukdetect_database_v2.tar.gz
conda env update --name eukdetect -f environment.yml
conda activate eukdetect
# install eukdetect
python setup.py install
# To test the installation, edit tests/configfile_for_tests.yml and fill in the path to the installation directory and the path to the EukDetect database.
python tests/test_eukdetect.py
Usage
# Copy default_configfile.yml to a new your_configfile.yml and edit all parameters in it as described below.
# gzip -dc test.fastq.gz | head -n 10000 | awk '{ if (NR%4==2){count++; bases += length}} END{printf "%3.0f\n", bases/count}'  # average read length of the first ~2,500 reads; use this to set readlen
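The same estimate can be made in Python if preferred; a minimal sketch (the file name test.fastq.gz is only an example) that averages the read length over the first ~2,500 reads:
# Rough Python equivalent of the awk one-liner above
import gzip
lengths = []
with gzip.open("test.fastq.gz", "rt") as fh:   # example file name
    for i, line in enumerate(fh):
        if i % 4 == 1:                         # every second line of a FASTQ record is the sequence
            lengths.append(len(line.strip()))
        if i >= 9999:                          # same as head -n 10000
            break
print(round(sum(lengths) / len(lengths)))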
The configfile.yml file
#Default config file for eukdetect. Copy and edit for analysis
#Directory where EukDetect output should be written
output_dir: "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/drep_bin/all_bin/fungi/eukdetect/"
#Indicate whether reads are paired (true) or single (false)
paired_end: true
#filename excluding sample name. no need to edit if paired_end = false
fwd_suffix: "_clean_1.fastq.gz"
#filename excluding sample name. no need to edit if paired_end = false
rev_suffix: "_clean_2.fastq.gz"
#file name excluding sample name. no need to edit if paired_end = true
se_suffix: ".fastq.gz"
#length of your reads. pre-trimming reads not recommended
readlen: 150
#full path to directory with raw fastq files
fq_dir: "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/metaMIC_contigs"
#full path to folder with eukdetect database files
database_dir: "/home/zhongpei/hard_disk_sda2/zhongpei/Software/EukDetect/database/"
#name of database. Default is original genomes only database name
database_prefix: "ncbi_eukprot_met_arch_markers.fna"
#full path to eukdetect installation folder
eukdetect_dir: "/home/zhongpei/hard_disk_sda2/zhongpei/Software/EukDetect"
#list sample names here. fastqs must correspond to {samplename}{se_suffix} for SE reads or {samplename}{fwd_suffix} and {samplename}{rev_suffix} for PE
#each sample name should be preceded by 2 spaces and followed by a colon character
samples:
  F1:
  F2:
  F3:
  F4:
  F5:
  F6:
  F7:
  F8:
  F9:
  F10:
  F11:
  F12:
  F13:
  F14:
  F15:
  F16:
  F17:
  F18:
  F19:
  F20:
  F21:
  F22:
  F23:
  F24:
  F25:
  F26:
  F27:
  F28:
  F29:
  F30:
  F31:
  F32:
  F33:
  F34:
  F35:
  F36:
  F37:
  F38:
  F39:
  F40:
  F41:
  F42:
  F43:
  F44:
  F45:
  F46:
  F47:
  F48:
  F49:
  F50:
  J1:
  J2:
  J3:
  J4:
  J5:
  J6:
  J7:
  J8:
  J9:
  J10:
  J11:
  J12:
  J13:
  J14:
  J15:
  J16:
  J17:
  J18:
  J19:
  J20:
  J21:
  J22:
  J23:
  J24:
  J25:
  J26:
  J27:
  J28:
  J29:
  J30:
  J31:
  J32:
  J33:
  J34:
  J35:
  J36:
  J37:
  J38:
  J39:
  J40:
  J41:
  J42:
  J43:
  J44:
  J45:
  J46:
  J47:
  J48:
  J49:
  J50:
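Listing 100 sample names by hand is tedious; the samples: block above can also be generated with a short Python sketch. The paths simply reuse the fq_dir and fwd_suffix values from this config and are assumptions about your layout, not part of EukDetect:
# Print a "samples:" block based on the paired-end forward reads found in fq_dir
import os
fq_dir = "/home/zhongpei/diarrhoea/xjs_FJ_metagenomic/metaMIC_contigs"  # same value as fq_dir above
fwd_suffix = "_clean_1.fastq.gz"                                        # same value as fwd_suffix above
print("samples:")
for fname in sorted(os.listdir(fq_dir)):
    if fname.endswith(fwd_suffix):
        print(f"  {fname[:-len(fwd_suffix)]}:")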
Running EukDetect
eukdetect --mode runall --configfile ~/your_configfile.yml --cores 32
Output files
*_filtered_hits_table.txt is the main result file.
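Each table is tab-separated with one row per detected taxon; samples with no detections contain a "No taxa passing filter requirements" message instead of a table. A quick way to peek at one sample (the file name is just an example; the column names are the ones the summary script below relies on):
# Inspect one sample's filtered hits table
import pandas as pd
hits = pd.read_csv("F1_filtered_hits_table.txt", sep="\t")
print(hits.columns.tolist())
print(hits[["Name", "Percent_observed_markers"]].head())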
Summarising the results
#! /usr/bin/env python
#########################################################
# Simplified microbial analysis - generates presence/absence matrix only
# Columns: sample names, Rows: species, Cell: present=1, absent=0
import argparse
import os
import pandas as pd
def clean_percentage_columns(df):
    """Clean percentage columns"""
    percentage_columns = ['Percent_observed_markers', 'Total_marker_coverage', 'Percent_identity']
    for col in percentage_columns:
        if col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].astype(str).str.replace('%', '').astype(float)
    return df
# Argument parsing
parser = argparse.ArgumentParser(description='Generate microbial presence/absence matrix')
parser.add_argument('--work_path', '-p', help='Path containing result files')
parser.add_argument('--file_maker', '-m', nargs='+', help='File identifiers (separated by space)')
parser.add_argument('--output_name', '-o', help='Output filename prefix')
parser.add_argument('--percent_threshold', '-t', type=float, default=50.0,
                    help='Threshold for Percent_observed_markers (default: 50.0)')
args = parser.parse_args()
# Set working directory
os.chdir(args.work_path)
files = os.listdir(args.work_path)
# Filter matched files
ls = []
for file in files:
    if all(maker in file for maker in args.file_maker):
        ls.append(file)
ls.sort()
print(f"Found {len(ls)} matching files:")
for f in ls:
    print(f" {f}")
if not ls:
    print("No matching files found!")
    exit(1)
# Collect all microbes passing threshold
all_microbes = set()
sample_microbe_dict = {}
print(f"\nProcessing files (threshold: >= {args.percent_threshold}%)...")
for file_name in ls:
    try:
        # Check file content
        with open(file_name, 'r') as f:
            first_line = f.readline().strip()
        if "No taxa passing filter requirements" in first_line or "No taxa" in first_line:
            print(f" {file_name}: No valid data - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
        # Read table
        df = pd.read_csv(file_name, sep='\t')
        if 'Name' not in df.columns or 'Percent_observed_markers' not in df.columns:
            print(f" {file_name}: Missing required columns - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
        if len(df) == 0:
            print(f" {file_name}: No data rows - Skipped")
            sample_microbe_dict[file_name] = set()
            continue
        # Clean %
        df = clean_percentage_columns(df)
        # Filter by threshold
        filtered_df = df[df['Percent_observed_markers'] >= args.percent_threshold]
        microbes_in_sample = set(filtered_df['Name'].tolist())
        sample_microbe_dict[file_name] = microbes_in_sample
        all_microbes.update(microbes_in_sample)
        print(f" {file_name}: {len(df)} records, {len(microbes_in_sample)} passed threshold")
    except Exception as e:
        print(f" {file_name}: Error occurred - {e} - Skipped")
        sample_microbe_dict[file_name] = set()
        continue

# Ensure all files have dictionary entry
for file_name in ls:
    if file_name not in sample_microbe_dict:
        sample_microbe_dict[file_name] = set()
print(f"\nTotal: {len(all_microbes)} unique microbes passed threshold")
if len(all_microbes) == 0:
    print("No microbes passed the threshold!")
    exit(1)
# Create presence/absence matrix
microbe_list = sorted(list(all_microbes))
sample_list = sorted(ls)
print(f"\nCreating matrix of size {len(microbe_list)} x {len(sample_list)}...")
matrix_data = []
for microbe in microbe_list:
    row = [microbe]
    for sample in sample_list:
        row.append(1 if microbe in sample_microbe_dict[sample] else 0)
    matrix_data.append(row)
columns = ['Microbe'] + sample_list
matrix_df = pd.DataFrame(matrix_data, columns=columns)
output_file = f'{args.output_name}_presence_matrix.txt'
matrix_df.to_csv(output_file, sep='\t', index=False)
print(f"\nMatrix saved to: {output_file}")
print(f"Matrix dimensions: {len(microbe_list)} microbes x {len(sample_list)} samples")
print(f"Matrix key: 1 = present and passed threshold, 0 = absent or below threshold")
# Show preview
print(f"\nMatrix preview (first 5 rows and columns):")
preview = matrix_df.iloc[:5, :6] if len(matrix_df.columns) > 6 else matrix_df.head()
print(preview.to_string(index=False))
print("\nAnalysis completed!")