使用Diamond比对NR数据库获取物种注释
之前用Kraken2注释宏基因组的contig,发现只有30%左右可以被Kraken2注释Kraken2+Bracken:宏基因组物种注释-CSDN博客
不信邪,再用NR库试试
参考:
将NR数据库diamond比对效果做物种注释_diamond 物种注释-CSDN博客
NR下载
nohup wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz &
wget -c https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5
#检查下载的完整性(一样的话就是完整了)
md5sum nr.gz### 899ac219cac213c60fede9c3d9ef8f7bnr.gz
cat nr.gz.md5 ### 899ac219cac213c60fede9c3d9ef8f7bnr.gz
nohup gunzipnr.gz&
mv nrnr.fa
###下载NR物种相关信息和taxid信息
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5
#检查完整性
md5sum prot.accession2taxid.gz#32b8ca7ea712e161c72af69135fc938e
cat prot.accession2taxid.gz.md5 #32b8ca7ea712e161c72af69135fc938e
md5sum taxdump.tar.gz #26558b800bc1b3795c25d1f0ead65412
cat taxdump.tar.gz.md5 #26558b800bc1b3795c25d1f0ead65412
#解压
tar -zxvf taxdump.tar.gz
gunzip prot.accession2taxid.gz
#移动需要的文件
cd ~
mkdir .taxonkit
cp *.dmp ~/.taxonkit
必要软件下载
conda create -n NR_database_search
conda activate NR_database_search
conda install -c bioconda taxonkit=0.15.0
conda install -c bioconda diamond=2.1.8
conda install -c bioconda csvtk=0.28.0diamond建库
nohup diamond makedb --threads 180 --in nr.fa --db NR_2023_07_23 &diamond比对
diamond blastx --db NR_2023_07_23.dmnd --query nucleic_reads.fna\
-o nucleic_matches_fmt6.txt --threads 180 --evalue 0.00001 \
--max-target-seqs 5 --outfmt 6
diamond blastp --db NR_2023_07_23.dmnd --query protein_reads.fna\
-o protein_matches_fmt6.txt --threads 180 --evalue 0.00001 \
--max-target-seqs 5 --outfmt 6
## --outfmt 6 最好别改变 这些参数可以调整diamond比对的速度or准确性
https://img-blog.csdnimg.cn/4cdad6e05bc443d5b9452c1c28a71999.png
这几个参数可以调整比对的coverage,identity,score
https://img-blog.csdnimg.cn/6ab242c4b14d49f2be7bb9fd3202c18b.png
效果如下 !!!(这个表头后面python会加)
https://img-blog.csdnimg.cn/cb2c0835a957456a838ca3776990307e.png
taxonkit得到物种分类信息表
感谢大佬:一文完成nt库序列快速下载及blast效果注释物种 (qq.com)
得到seqid注释之后,可以搜刮注释
## 一些主要的物种编号
# 2 bacteria
# 2157archaea
# 4751fungi
# 10239 virus
# 2759 Eukaryota
#看taxnokit安好了么
taxonkit -h
#创建目录
cd /home/zhongpei/database/NR_2023_07_23
mkdir /home/zhongpei/database/NR_2023_07_23/NCBI_Main_tax
cd NCBI_Main_tax
#开始
taxonkit list -j 4 --ids 2,2157,4751,10239,2759 --indent "" > NCBI_Main.taxid.txt
# -j 是线程,软件说4个够了;--ids 是需要的物种编号,用逗号分隔
wc -l NCBI_Main.taxid.txt # 2708739 NCBI_Main.taxid.txt
head -n 5 NCBI_Main.taxid.txt #查看内容
# 提取taxid和taxonomy(界门纲目科属种)的对应信息到NCBI_Main.taxid.txt
less NCBI_Main.taxid.txt | taxonkit reformat -I 1 -r Unassigned -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}"| sed '1i\Taxid\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\tStrain' > NCBI_Main.taxid_new.txt
# -I 1 一个制表符分隔;-r 没有找到的用什么字符去填充,这里用的“Unassigned”
# -f 输出的格式;1i表示在第行之前插入文本(sed用法,不太会)
完成!
https://img-blog.csdnimg.cn/34e0c52efed944fb81075934fea8a8de.jpeg
vim NCBI_Main.taxid_new.txt
把第一个Taxid改成小写
seqid和taxid的对应
还记不记得第一步下载过一个 "prot.accession2taxid" ,现在要派上用场了
实在python也能做,csvtk太不熟悉了,先来学习一下吧,感觉还挺方便(这一步比较慢)
cat prot.accession2taxid | csvtk -t grep -f taxid -P NCBI_Main.taxid.txt | csvtk -t cut -f accession.version,taxid > NCBI_seqid_taxid.txt
# "cat prot.accession2taxid |" 把 prot.accession2taxid 的内容到下面的 csvtk
# -t 输入内容是制表符分隔;grep 这是csvtk的1个子命令,用于在文件中搜索匹配的行
# -P 搜索那些"taxid"字段的值出现在"NCBI_Main.taxid.txt"文件中的行
# cut 这是csvtk的1个子命令,用于从输入中选择特定的字段
NCBI_seqid_taxid.txt 就是目标文件
https://img-blog.csdnimg.cn/8860a202cc37431d9486b84b4e0f77b5.png
diamond seqid和taxid对应,再和界门纲目科属种对应
把diamond效果文件与NCBI_seqid_taxid.txt对应
#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
# ##########################################################
# match diamond blast NR result.txt and NCBI_seqid_taxid.txt
# written by PeiZhong in IFR of CAAS
import os
import argparse
parser = argparse.ArgumentParser(description='match diamond blast NR result.txt and NCBI_seqid_taxid.txt. '
'!!! all the file should end will .txt !!!')
parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your diamond result txt')
parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
'your two-column diamond results file in this folder, e.g., clean, '
'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
parser.add_argument('NCBI_acc_taxid',help='full Path that contain NCBI_seqid_taxid.txt')
args = parser.parse_args()
result_file_folder_path = args.diamond_result_folder_path
NCBI_file = args.NCBI_acc_taxid
file_mark = args.result_files_mark
os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
print(files)
db={}
print("start db read")
f_table = open("%s" % (NCBI_file), 'r')
print("start db build")
for line in f_table.readlines():
line=line.split('\t')
acc_num = line.strip()
tax_num = line.strip()
db = tax_num
print("finish db build")
file_ls = []
for result_file in files:
if file_mark in result_file:
file_ls.append(result_file)
file_ls.sort()
print(file_ls)
header = "qseqid\taccession.version\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
for result_file in file_ls:
print(result_file)
f_result = open("%s" % (result_file), 'r+')
content = f_result.read()
f_result.seek(0, 0)
f_result.write(header.rstrip('\r\n') + '\n' + content)
f_result.close()
out_name = str(result_file).strip('txt')
out_name = out_name+'_taxid.txt'
f_result = open("%s" % (result_file), 'r')
f_out = open("%s" % (out_name), 'a')
for line in f_result.readlines():
line = line.split('\t')
query_num = line.strip()
acc_q_num = line.strip()
if acc_q_num in db:
print(query_num,end="\t",file=f_out)
print(acc_q_num, end="\t", file=f_out)
print(db, file=f_out)
f_out.close()
f_result.close()
f_table.close()https://img-home.csdnimg.cn/images/20230724024159.png?origin_url=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhAQABAPABAP%2F%2F%2FwAAACH5BAEKAAAALAAAAAABAAEAAAICRAEAOw%3D%3D&pos_id=lSc39Tbv
chmod +x diamond_NR_tax.py(完整地址)
diamond_NR_tax.py(完整地址) -h
diamond_NR_tax.py(完整地址) diamond_result_folder_path result_files_mark NCBI_acc_taxid
diamond_result_folder_path:你存放上面处理完的diamond比对文件,txt结尾的文件,的目录(完整地址)
result_files_mark:这个地址中,你的这些文件独有的标识字符串
#for example,the mark of result file 'SY10_clean.txt' and 'SY11_clean.txt' is 'clean'
NCBI_acc_taxid:你的NCBI_seqid_taxid.txt文件的完整地址https://img-home.csdnimg.cn/images/20230724024159.png?origin_url=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhAQABAPABAP%2F%2F%2FwAAACH5BAEKAAAALAAAAAABAAEAAAICRAEAOw%3D%3D&pos_id=NLg5k6zv
做完效果是如许的
https://img-blog.csdnimg.cn/3b6742277ee64b889df299b95e182c17.png
https://img-blog.csdnimg.cn/3fcbc5c1129249da93fd679496e5f408.png
接下来再写个python代码根据taxid把这个文件和界门纲目科属种接洽起来就行(不美意思只会python,我检讨。。。。但是python简单呀)
#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
# ##########################################################
# match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt
# written by PeiZhong in IFR of CAAS
import os
import argparse
parser = argparse.ArgumentParser(description='match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt. '
'!!! all the file should end will .txt !!!')
parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your result txt')
parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
'your two-column diamond results file in this folder, e.g., clean, '
'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
parser.add_argument('NCBI_taxid_tax',help='full Path that contain NCBI_Main.taxid_new.txt')
args = parser.parse_args()
result_file_folder_path = args.diamond_result_folder_path
NCBI_file = args.NCBI_taxid_tax
file_mark = args.result_files_mark
os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
print(files)
db={}
print("start db read")
f_table = open("%s" % (NCBI_file), 'r')
print("start db build")
for line in f_table.readlines():
line=line.split('\t')
taxid = line.strip()
tax_anno = line
db = tax_anno
print("finish db build")
file_ls = []
for result_file in files:
if file_mark in result_file:
file_ls.append(result_file)
file_ls.sort()
print(file_ls)
for result_file in file_ls:
print(result_file)
out_name = str(result_file).strip('txt')
out_name = out_name+'_tax.txt'
f_result = open("%s" % (result_file), 'r')
f_out = open("%s" % (out_name), 'a')
for line in f_result.readlines():
line = line.split('\t')
query_num = line.strip()
acc_q_num = line.strip()
taxid_1 = line.strip()
if taxid_1 in db:
print(query_num,end="\t",file=f_out)
print(acc_q_num, end="\t", file=f_out)
print(taxid_1, end="\t", file=f_out)
tax_in_db = db
str_ls = map(str, tax_in_db)
tax = '\t'.join(str_ls)
print(tax, file=f_out)
f_out.close()
f_result.close()
f_table.close()https://img-home.csdnimg.cn/images/20230724024159.png?origin_url=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhAQABAPABAP%2F%2F%2FwAAACH5BAEKAAAALAAAAAABAAEAAAICRAEAOw%3D%3D&pos_id=8ap3M0w6
和上面一样也需要给权限
https://img-blog.csdnimg.cn/d8d467142b804527b67d6d4df2a0864b.png
好了,可以交差了 !我公布python是我们初学者的yyds
https://img-blog.csdnimg.cn/9bf0df81f3af4e6cba7016cc59908c97.png
再把多个样本的效果结合到一起成为表格
#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
# ##########################################################
# match diamond blast NR result.txt and NCBI_seqid_taxid.txt
# written by PeiZhong in IFR of CAAS
import argparse
import os
import pandas as pd
import csv
parser = argparse.ArgumentParser(description='tax files combine')
parser.add_argument('tax_result_folder_path',help='full Path of the folder that contain your tax result txt')
parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
'your two-column diamond results file in this folder, e.g., clean, '
'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
args = parser.parse_args()
result_file_folder_path = args.tax_result_folder_path
file_mark = args.result_files_mark
path = os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
file_ls = []
for file in files:
if file_mark in file:
file_ls.append(file)
file_ls.sort()
print(file_ls)
for file in file_ls:
df = pd.read_csv('%s' % (file),sep='\t')
df.drop_duplicates(subset='qseqid', keep='first', inplace=True)
outname = str(file).rsplit(".", 1)
df.to_csv(outname+'_only1.txt', index=False, sep='\t')
path = os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
file2_ls = []
for file2 in files:
if "only1" in file2:
file2_ls.append(file2)
file2_ls.sort()
print(file2_ls)
def tax_finder(file):
tax_ls = []
with open(file, 'r') as f:
for line in f.readlines():
if "qseqid" not in line:
tax = ';'.join(line.split('\t'))
tax_ls.append(tax)
db = {}
for i in tax_ls:
if i in db:
db += 1
if i not in db:
db = 1
output = str(file).rsplit(".",1)+"_count1.txt"
with open(output,"a") as f3:
for key, value in db.items():
key = key.strip()
print(key, end="\t", file=f3)
print(value, file=f3)
for file in file2_ls:
tax_finder(file)
path = os.chdir(result_file_folder_path)
files = os.listdir(result_file_folder_path)
file3_ls = []
for file3 in files:
if "count1" in file3:
file3_ls.append(file3)
file3_ls.sort()
print(file3_ls)
all_tax = {}
for file in file3_ls:
with open("%s" % (file),"r") as f4:
for line in f4.readlines():
tax = line.split('\t')
all_tax = 0
print(all_tax)
ls_db_result = []
for file in file3_ls:
with open("%s" % (file),"r") as f5:
db_name = str(file)
db_name = all_tax.copy()
for line in f5.readlines():
tax = line.split('\t')
count = line.strip("\n").split('\t')
if tax in db_name:
db_name = count
ls_db_result.append(db_name)
header=[]
for key in all_tax.keys():
header.append(key)
with open('merge_overview_2.csv', 'a', encoding='utf-8', newline='') as f7:
dictWriter = csv.DictWriter(f7,header)
dictWriter.writeheader()
dictWriter.writerows(ls_db_result)
f7_rows=[]
with open('merge_overview_2.csv',"r") as f7:
for line in f7.readlines():
f7_rows.append(line)
with open('merge_overview_3.csv',"a") as f8:
print("",end=",",file=f8)
print(f7_rows.strip("\n"),file=f8)
row_count=0
for i in file3_ls:
row_count += 1
print(i,end=",",file=f8)
print(f7_rows.strip("\n"),file=f8)
https://img-blog.csdnimg.cn/5ab6a837fa744544b87ce3afb215d83c.png
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。
页:
[1]