之前用Kraken2注释宏基因组的contig,发现只有30%左右可以被Kraken2注释
Kraken2+Bracken:宏基因组物种注释-CSDN博客
不信邪,再用NR库试试
参考:
将NR数据库diamond比对效果做物种注释_diamond 物种注释-CSDN博客
NR下载
- nohup wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz &
- wget -c https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5
- #检查下载的完整性(一样的话就是完整了)
- md5sum nr.gz ### 899ac219cac213c60fede9c3d9ef8f7b nr.gz
- cat nr.gz.md5 ### 899ac219cac213c60fede9c3d9ef8f7b nr.gz
- nohup gunzip nr.gz &
- mv nr nr.fa
- ###下载NR物种相关信息和taxid信息
- wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
- wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5
- wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
- wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5
- #检查完整性
- md5sum prot.accession2taxid.gz #32b8ca7ea712e161c72af69135fc938e
- cat prot.accession2taxid.gz.md5 #32b8ca7ea712e161c72af69135fc938e
- md5sum taxdump.tar.gz #26558b800bc1b3795c25d1f0ead65412
- cat taxdump.tar.gz.md5 #26558b800bc1b3795c25d1f0ead65412
- #解压
- tar -zxvf taxdump.tar.gz
- gunzip prot.accession2taxid.gz
- #移动需要的文件
- cd ~
- mkdir .taxonkit
- cp *.dmp ~/.taxonkit
复制代码 必要软件下载
- conda create -n NR_database_search
- conda activate NR_database_search
- conda install -c bioconda taxonkit=0.15.0
- conda install -c bioconda diamond=2.1.8
- conda install -c bioconda csvtk=0.28.0
复制代码 diamond建库
- nohup diamond makedb --threads 180 --in nr.fa --db NR_2023_07_23 &
复制代码 diamond比对
- diamond blastx --db NR_2023_07_23.dmnd --query nucleic_reads.fna\
- -o nucleic_matches_fmt6.txt --threads 180 --evalue 0.00001 \
- --max-target-seqs 5 --outfmt 6
- diamond blastp --db NR_2023_07_23.dmnd --query protein_reads.fna\
- -o protein_matches_fmt6.txt --threads 180 --evalue 0.00001 \
- --max-target-seqs 5 --outfmt 6
- ## --outfmt 6 最好别改变
复制代码 这些参数可以调整diamond比对的速度or准确性
这几个参数可以调整比对的coverage,identity,score
效果如下 !!!(这个表头后面python会加)
taxonkit得到物种分类信息表
感谢大佬:一文完成nt库序列快速下载及blast效果注释物种 (qq.com)
得到seqid注释之后,可以搜刮注释
- ## 一些主要的物种编号
- # 2 bacteria
- # 2157 archaea
- # 4751 fungi
- # 10239 virus
- # 2759 Eukaryota
- #看taxnokit安好了么
- taxonkit -h
- #创建目录
- cd /home/zhongpei/database/NR_2023_07_23
- mkdir /home/zhongpei/database/NR_2023_07_23/NCBI_Main_tax
- cd NCBI_Main_tax
- #开始
- taxonkit list -j 4 --ids 2,2157,4751,10239,2759 --indent "" > NCBI_Main.taxid.txt
- # -j 是线程,软件说4个够了;--ids 是需要的物种编号,用逗号分隔
- wc -l NCBI_Main.taxid.txt # 2708739 NCBI_Main.taxid.txt
- head -n 5 NCBI_Main.taxid.txt #查看内容
- # 提取taxid和taxonomy(界门纲目科属种)的对应信息到NCBI_Main.taxid.txt
- less NCBI_Main.taxid.txt | taxonkit reformat -I 1 -r Unassigned -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}"| sed '1i\Taxid\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\tStrain' > NCBI_Main.taxid_new.txt
- # -I 1 一个制表符分隔;-r 没有找到的用什么字符去填充,这里用的“Unassigned”
- # -f 输出的格式;1i表示在第行之前插入文本(sed用法,不太会)
复制代码 完成!
vim NCBI_Main.taxid_new.txt
把第一个Taxid改成小写
seqid和taxid的对应
还记不记得第一步下载过一个 "prot.accession2taxid" ,现在要派上用场了
实在python也能做,csvtk太不熟悉了,先来学习一下吧,感觉还挺方便(这一步比较慢)
- cat prot.accession2taxid | csvtk -t grep -f taxid -P NCBI_Main.taxid.txt | csvtk -t cut -f accession.version,taxid > NCBI_seqid_taxid.txt
- # "cat prot.accession2taxid |" 把 prot.accession2taxid 的内容到下面的 csvtk
- # -t 输入内容是制表符分隔;grep 这是csvtk的1个子命令,用于在文件中搜索匹配的行
- # -P 搜索那些"taxid"字段的值出现在"NCBI_Main.taxid.txt"文件中的行
- # cut 这是csvtk的1个子命令,用于从输入中选择特定的字段
复制代码 NCBI_seqid_taxid.txt 就是目标文件
diamond seqid和taxid对应,再和界门纲目科属种对应
把diamond效果文件与NCBI_seqid_taxid.txt对应
- #!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
- # ##########################################################
- # match diamond blast NR result.txt and NCBI_seqid_taxid.txt
- # written by PeiZhong in IFR of CAAS
- import os
- import argparse
- parser = argparse.ArgumentParser(description='match diamond blast NR result.txt and NCBI_seqid_taxid.txt. '
- '!!! all the file should end will .txt !!!')
- parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your diamond result txt')
- parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
- 'your two-column diamond results file in this folder, e.g., clean, '
- 'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
- parser.add_argument('NCBI_acc_taxid',help='full Path that contain NCBI_seqid_taxid.txt')
- args = parser.parse_args()
- result_file_folder_path = args.diamond_result_folder_path
- NCBI_file = args.NCBI_acc_taxid
- file_mark = args.result_files_mark
- os.chdir(result_file_folder_path)
- files = os.listdir(result_file_folder_path)
- print(files)
- db={}
- print("start db read")
- f_table = open("%s" % (NCBI_file), 'r')
- print("start db build")
- for line in f_table.readlines():
- line=line.split('\t')
- acc_num = line[0].strip()
- tax_num = line[1].strip()
- db[acc_num] = tax_num
- print("finish db build")
- file_ls = []
- for result_file in files:
- if file_mark in result_file:
- file_ls.append(result_file)
- file_ls.sort()
- print(file_ls)
- header = "qseqid\taccession.version\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
- for result_file in file_ls:
- print(result_file)
- f_result = open("%s" % (result_file), 'r+')
- content = f_result.read()
- f_result.seek(0, 0)
- f_result.write(header.rstrip('\r\n') + '\n' + content)
- f_result.close()
- out_name = str(result_file).strip('txt')
- out_name = out_name+'_taxid.txt'
- f_result = open("%s" % (result_file), 'r')
- f_out = open("%s" % (out_name), 'a')
- for line in f_result.readlines():
- line = line.split('\t')
- query_num = line[0].strip()
- acc_q_num = line[1].strip()
- if acc_q_num in db:
- print(query_num,end="\t",file=f_out)
- print(acc_q_num, end="\t", file=f_out)
- print(db[acc_q_num], file=f_out)
- f_out.close()
- f_result.close()
- f_table.close()
复制代码
- chmod +x diamond_NR_tax.py(完整地址)
- diamond_NR_tax.py(完整地址) -h
- diamond_NR_tax.py(完整地址) diamond_result_folder_path result_files_mark NCBI_acc_taxid
- diamond_result_folder_path:你存放上面处理完的diamond比对文件,txt结尾的文件,的目录(完整地址)
- result_files_mark:这个地址中,你的这些文件独有的标识字符串
- #for example,the mark of result file 'SY10_clean.txt' and 'SY11_clean.txt' is 'clean'
- NCBI_acc_taxid:你的NCBI_seqid_taxid.txt文件的完整地址
复制代码
做完效果是如许的
接下来再写个python代码根据taxid把这个文件和界门纲目科属种接洽起来就行(不美意思只会python,我检讨。。。。但是python简单呀)
- #!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
- # ##########################################################
- # match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt
- # written by PeiZhong in IFR of CAAS
- import os
- import argparse
- parser = argparse.ArgumentParser(description='match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt. '
- '!!! all the file should end will .txt !!!')
- parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your result txt')
- parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
- 'your two-column diamond results file in this folder, e.g., clean, '
- 'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
- parser.add_argument('NCBI_taxid_tax',help='full Path that contain NCBI_Main.taxid_new.txt')
- args = parser.parse_args()
- result_file_folder_path = args.diamond_result_folder_path
- NCBI_file = args.NCBI_taxid_tax
- file_mark = args.result_files_mark
- os.chdir(result_file_folder_path)
- files = os.listdir(result_file_folder_path)
- print(files)
- db={}
- print("start db read")
- f_table = open("%s" % (NCBI_file), 'r')
- print("start db build")
- for line in f_table.readlines():
- line=line.split('\t')
- taxid = line[0].strip()
- tax_anno = line[1:8]
- db[taxid] = tax_anno
- print("finish db build")
- file_ls = []
- for result_file in files:
- if file_mark in result_file:
- file_ls.append(result_file)
- file_ls.sort()
- print(file_ls)
- for result_file in file_ls:
- print(result_file)
- out_name = str(result_file).strip('txt')
- out_name = out_name+'_tax.txt'
- f_result = open("%s" % (result_file), 'r')
- f_out = open("%s" % (out_name), 'a')
- for line in f_result.readlines():
- line = line.split('\t')
- query_num = line[0].strip()
- acc_q_num = line[1].strip()
- taxid_1 = line[2].strip()
- if taxid_1 in db:
- print(query_num,end="\t",file=f_out)
- print(acc_q_num, end="\t", file=f_out)
- print(taxid_1, end="\t", file=f_out)
- tax_in_db = db[taxid_1]
- str_ls = map(str, tax_in_db)
- tax = '\t'.join(str_ls)
- print(tax, file=f_out)
- f_out.close()
- f_result.close()
- f_table.close()
复制代码
和上面一样也需要给权限
好了,可以交差了 !我公布python是我们初学者的yyds
再把多个样本的效果结合到一起成为表格
- #!/home/zhongpei/miniconda3/envs/py39/bin/python3.9
- # ##########################################################
- # match diamond blast NR result.txt and NCBI_seqid_taxid.txt
- # written by PeiZhong in IFR of CAAS
- import argparse
- import os
- import pandas as pd
- import csv
- parser = argparse.ArgumentParser(description='tax files combine')
- parser.add_argument('tax_result_folder_path',help='full Path of the folder that contain your tax result txt')
- parser.add_argument('result_files_mark',help='mark=The name of the mark specific to '
- 'your two-column diamond results file in this folder, e.g., clean, '
- 'for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')
- args = parser.parse_args()
- result_file_folder_path = args.tax_result_folder_path
- file_mark = args.result_files_mark
- path = os.chdir(result_file_folder_path)
- files = os.listdir(result_file_folder_path)
- file_ls = []
- for file in files:
- if file_mark in file:
- file_ls.append(file)
- file_ls.sort()
- print(file_ls)
- for file in file_ls:
- df = pd.read_csv('%s' % (file),sep='\t')
- df.drop_duplicates(subset='qseqid', keep='first', inplace=True)
- outname = str(file).rsplit(".", 1)[0]
- df.to_csv(outname+'_only1.txt', index=False, sep='\t')
- path = os.chdir(result_file_folder_path)
- files = os.listdir(result_file_folder_path)
- file2_ls = []
- for file2 in files:
- if "only1" in file2:
- file2_ls.append(file2)
- file2_ls.sort()
- print(file2_ls)
- def tax_finder(file):
- tax_ls = []
- with open(file, 'r') as f:
- for line in f.readlines():
- if "qseqid" not in line:
- tax = ';'.join(line.split('\t')[3:9])
- tax_ls.append(tax)
- db = {}
- for i in tax_ls:
- if i in db:
- db[i] += 1
- if i not in db:
- db[i] = 1
- output = str(file).rsplit(".",1)[0]+"_count1.txt"
- with open(output,"a") as f3:
- for key, value in db.items():
- key = key.strip()
- print(key, end="\t", file=f3)
- print(value, file=f3)
- for file in file2_ls:
- tax_finder(file)
- path = os.chdir(result_file_folder_path)
- files = os.listdir(result_file_folder_path)
- file3_ls = []
- for file3 in files:
- if "count1" in file3:
- file3_ls.append(file3)
- file3_ls.sort()
- print(file3_ls)
- all_tax = {}
- for file in file3_ls:
- with open("%s" % (file),"r") as f4:
- for line in f4.readlines():
- tax = line.split('\t')[0]
- all_tax[tax] = 0
- print(all_tax)
- ls_db_result = []
- for file in file3_ls:
- with open("%s" % (file),"r") as f5:
- db_name = str(file)
- db_name = all_tax.copy()
- for line in f5.readlines():
- tax = line.split('\t')[0]
- count = line.strip("\n").split('\t')[1]
- if tax in db_name:
- db_name[tax] = count
- ls_db_result.append(db_name)
- header=[]
- for key in all_tax.keys():
- header.append(key)
- with open('merge_overview_2.csv', 'a', encoding='utf-8', newline='') as f7:
- dictWriter = csv.DictWriter(f7,header)
- dictWriter.writeheader()
- dictWriter.writerows(ls_db_result)
- f7_rows=[]
- with open('merge_overview_2.csv',"r") as f7:
- for line in f7.readlines():
- f7_rows.append(line)
- with open('merge_overview_3.csv',"a") as f8:
- print("",end=",",file=f8)
- print(f7_rows[0].strip("\n"),file=f8)
- row_count=0
- for i in file3_ls:
- row_count += 1
- print(i,end=",",file=f8)
- print(f7_rows[row_count].strip("\n"),file=f8)
复制代码
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |