scanpy和Seurat单细胞分析对比
发布日期:2024-06-30 22:58 点击次数:64
scanpy和seurat分析代码比拟
尝试把曾强健的单细胞seurat分析的代码更变成scanpy版块的,包括样品读取,质控,harmony去除批次效应,降维聚类,marker松懈。
seurat版块的R代码先望望seurat版块的R代码,内部调用了另外几个文献,lib.R,qc.R,harmony.R,check-all-markers.R,这几个文献昔时应该齐有共享过.scRNA_scripts文献夹的r代码齐在上头的百度云网盘畅达(https://pan.baidu.com/s/1MzfqW07P9ZqEA_URQ6rLbA?pwd=3heo)内部哦:
rm(list=ls())options(stringsAsFactors = F) source('scRNA_scripts/lib.R')###### step1:导入数据 ###### # 付费法子 800 元东谈主民币dir='GSE189357_RAW/output/' samples=list.files( dir )samples library(data.table)sceList = lapply(samples,function(pro){ # pro=samples[1] print(pro) sce=CreateSeuratObject(counts = Read10X(file.path(dir,pro) ) , project = pro , min.cells = 5, min.features = 300,) return(sce)})names(sceList) library(stringr)# samples=gsub('.txt.gz','',str_split(samples,'_',simplify = T)[,2])samplesnames(sceList) = samplessce.all <- merge(sceList[[1]], y= sceList[ -1 ] , add.cell.ids = samples) as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])head(sce.all@meta.data, 10)table(sce.all@meta.data$orig.ident) ###### step2:QC质控 ######dir.create("./1-QC")setwd("./1-QC")# 若是过滤的太狠,就需要去修改这个过滤代码source('../scRNA_scripts/qc.R')sce.all.filt = basic_qc(sce.all)print(dim(sce.all))print(dim(sce.all.filt))setwd('../')sp='human'###### step3: harmony整合多个单细胞样品 ######dir.create("2-harmony")getwd()setwd("2-harmony")source('../scRNA_scripts/harmony.R')# 默许 ScaleData 莫得添加"nCount_RNA", "nFeature_RNA"# 默许的sce.all.int = run_harmony(sce.all.filt)setwd('../')###### step4: 降维聚类分群和看符号基因库 ####### 原则上分裂率是需要我方肉眼判断,取决于个东谈主教养# 为了省力,咱们平直看 0.1和0.8即可table(Idents(sce.all.int))table(sce.all.int$seurat_clusters)table(sce.all.int$RNA_snn_res.0.1) table(sce.all.int$RNA_snn_res.0.8) getwd()dir.create('check-by-0.1')setwd('check-by-0.1')sel.clust = "RNA_snn_res.0.1"sce.all.int <- SetIdent(sce.all.int, value = sel.clust)table(sce.all.int@active.ident) source('../scRNA_scripts/check-all-markers.R')setwd('../') getwd()dir.create('check-by-0.8')setwd('check-by-0.8')sel.clust = "RNA_snn_res.0.8"sce.all.int <- SetIdent(sce.all.int, value = sel.clust)table(sce.all.int@active.ident) source('../scRNA_scripts/check-all-markers.R')setwd('../') getwd()dir.create('check-by-0.5')setwd('check-by-0.5')sel.clust = "RNA_snn_res.0.5"sce.all.int <- SetIdent(sce.all.int, value = sel.clust)table(sce.all.int@active.ident) source('../scRNA_scripts/check-all-markers.R')setwd('../') getwd()last_markers_to_checkscanpy版块的python代码
装配各式库的技艺,平直pip install scanpy即可
若是是在jupyter notebook或者jupyter lab里,需要在前边加上!,敕令行里就毋庸
!pip install scanpy
#装配导入需要的模块import scanpy as scimport numpy as np import pandas as pd from glob import globimport reimport seaborn as snsimport matplotlib.pyplot as plt from tqdm import tqdmimport osfrom collections import Counterimport timeimport cosg as cosg%matplotlib inline
# 运行计时a = time.perf_counter()
scanpy下的read_10x_mtx函数读取数据,传入文献夹旅途,保证文献夹下一经这三个文献即可
图片
轮回读取9个单细胞数据,用字典进行存储,key为样真名,value为scanpy读取后的对象用scanpy下的concat函数进行多个样本的归拢
scanpy对象结构参考下图
adata.var是活动gene,列为统计场所的矩阵,进行完各式分析之后,好像如下图,刚读完数据的技艺,只须行名,莫得列
图片
adata.obs是活动细胞,列为统计场所的矩阵图片
adata.uns内部存放的口角结构化的数据,齐是字典的面容图片
比如sample color,存放的是每个样品对应的样式,pca内部存放有降维后的主因素的信息,louvain 0.01_colors内部存放的便是0.01的分裂率下每个cluster对应的样式图片
adata.X内部存放的是抒发矩阵,活动cell,列为gene图片
### Step1 批量读取单细胞数据# 字符串前边加上r之后,旅途里的/和\就毋庸故意修改了path = r'F:\新建文献夹\2022-GSE189357-LUAD-单细胞-疾病发扬\GSE189357_RAW\output'files = glob(path + '\*')# tqdm用来泄露经过条,不加也不错data_dic = {}for i in tqdm(files): sample_name = i.split('\\')[-1] # cache=True 会存储读取文献技艺的缓存,下次再读取就很快了 adata = sc.read_10x_mtx(i,var_names='gene_symbols',cache=True) adata.var_names_make_unique() adata.obs_names_make_unique() data_dic[sample_name] = adata adata = sc.concat(data_dic,label='sample',axis=0)adata.obs_names_make_unique()os.getcwd()work_dir = r'F:新建文献夹/'os.chdir(work_dir)
qc部分,参考曾强健seurat里qc.R剧本写的
这里只磋商了human gene,齐是大写的
#盘算比例和设施质控 def basic_qc(adata): # 盘算线粒体基因比例 adata.var['mt'] = adata.var_names.str.startswith('MT-') sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) mt_gene = adata.var.index[adata.var_names.str.startswith('MT-')] print('线粒体gene:',mt_gene) # 盘算核糖体基因比例 adata.var['rp'] = [True if i.startswith('RPL') or i.startswith('RPS') else False for i in adata.var_names] sc.pp.calculate_qc_metrics(adata, qc_vars=['rp'], percent_top=None, log1p=False, inplace=True) rp_gene = [i for i in adata.var_names if i.startswith('RPL') or i.startswith('RPS')] print('核糖体gene:',rp_gene) # 盘算红血细胞基因比例 adata.var['hb'] = [True if i.startswith('HB') and not i.startswith('HBP') else False for i in adata.var_names] sc.pp.calculate_qc_metrics(adata, qc_vars=['hb'], percent_top=None, log1p=False, inplace=True) hb_gene = [i for i in adata.var_names if i.startswith('HB') and not i.startswith('HBP')] print('红血细胞gene:',hb_gene) # 可视化细胞各gene比例 if not os.path.isdir('qc'): os.mkdir('qc') sc.pl.violin(adata,keys=['n_genes_by_counts','total_counts'],groupby='sample',show = False) plt.savefig('qc/vlnplot1.pdf') sc.pl.violin(adata,keys=['pct_counts_mt','pct_counts_rp','pct_counts_hb'],groupby='sample',show = False) plt.savefig('qc/vlnplot2.pdf') sc.pl.scatter(adata,x='total_counts', y='n_genes_by_counts',color='sample',show = False) plt.savefig('qc/scatterplot.pdf') sc.pl.highest_expr_genes(adata,n_top=20,show = False) plt.savefig('qc/TOP20_most_expressed_gene.pdf') # 过滤 print('过滤前:',adata.shape) adata = adata[adata.obs.n_genes_by_counts < 6000, :] adata = adata[adata.obs.pct_counts_mt < 20, :] adata = adata[adata.obs.pct_counts_rp > 3, :] adata = adata[adata.obs.pct_counts_hb < 1, :] print('过滤后:',adata.shape) # 可视化过滤后的恶果 sc.pl.violin(adata,keys=['n_genes_by_counts','total_counts'],groupby='sample',show = False) plt.savefig('qc/vlnplot1_filtered.pdf') sc.pl.violin(adata,keys=['pct_counts_mt','pct_counts_rp','pct_counts_hb'],groupby='sample',show = False) plt.savefig('qc/vlnplot2_filtered.pdf') return adata
#harmony整合def run_harmony(adata): # normalize sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) adata.raw = adata # 找高变gene sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) # 只保留高变gene adata = adata[:, adata.var.highly_variable] # 凭据线粒体gene进行regress sc.pp.regress_out(adata, 'pct_counts_mt') # scale sc.pp.scale(adata) # pca sc.tl.pca(adata, svd_solver='arpack') sc.external.pp.harmony_integrate(adata,key='sample') sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20,use_rep='X_pca_harmony') sc.tl.umap(adata) # 建立不同分裂率 for i in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1]: sc.tl.louvain(adata,resolution=i,key_added='louvain {}'.format(i)) if not os.path.isdir('harmony'): os.mkdir('harmony') sc.pl.umap(adata,color=['louvain 0.01','louvain 0.1','louvain 0.2'],show = False) plt.savefig('harmony/resolution_low.pdf') sc.pl.umap(adata,color=['louvain 0.5','louvain 0.8','louvain 1'],show = False) plt.savefig('harmony/resolution_high.pdf') return adata
def run_harmony(adata): # normalize sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) adata.raw = adata # 找高变gene sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) # 只保留高变gene adata = adata[:, adata.var.highly_variable] # 凭据线粒体gene进行regress sc.pp.regress_out(adata, 'pct_counts_mt') # scale sc.pp.scale(adata) # pca sc.tl.pca(adata, svd_solver='arpack') sc.external.pp.harmony_integrate(adata,key='sample') sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20,use_rep='X_pca_harmony') sc.tl.umap(adata) # 建立不同分裂率 for i in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1]: sc.tl.louvain(adata,resolution=i,key_added='louvain {}'.format(i)) if not os.path.isdir('harmony'): os.mkdir('harmony') sc.pl.umap(adata,color=['louvain 0.01','louvain 0.1','louvain 0.2'],show = False) plt.savefig('harmony/resolution_low.pdf') sc.pl.umap(adata,color=['louvain 0.5','louvain 0.8','louvain 1'],show = False) plt.savefig('harmony/resolution_high.pdf') return adata恶果展示
adata = basic_qc(adata)
图片
图片
图片
adata = run_harmony(adata)
图片
check_markers(0.1)check_markers(0.3)check_markers(0.8)
图片
图片
运行完耗时1021s 本站仅提供存储劳动,通盘履行均由用户发布,如发现存害或侵权履行,请点击举报。