# Human Xenium Lung Dataset ## Imputation The ST dataset can be downloaded from https://www.10xgenomics.com/datasets/ffpe-human-lung-cancer-data-with-human-immuno-oncology-profiling-panel-and-custom-add-on-1-standard. The corresponding scRNA-seq dataset can be downloaded from https://cells.ucsc.edu/. ``` import pandas as pd import numpy as np import scanpy as sc import random import os, sys sys.path.append('./pasta') import __init__ import _version import optimizer import mapper import utils # ST data preprocess mat = pd.read_csv("gene_expression_lung.csv") # the gene expression file from ST data mat.index = mat.iloc[\:, 0] mat_f = mat.drop("Unnamed: 0", axis=1) mat_f = mat_f.T # get the centroid for the cells coords = pd.read_csv("centroid.csv") # the centroid file from ST data coords_f = coords.loc[coords["cell"].isin(mat_f.index), :] coords_f.index = coords_f["cell"] coords_f = coords_f.reindex(mat_f.index) # get the cluster file for the ST data # we used the kmeans 7 clusters provided by the dataset cluster = pd.read_csv("clusters.csv") cluster.index = cluster["Barcode"] cluster_f = cluster.loc[cluster["Barcode"].isin(mat_f.index), :] # read the scRNA-seq data # assume it is in the folder named as "SCdata" ad = sc.read_text("./SCdata/exprMatrix.tsv.gz") meta = pd.read_csv("./SCdata/meta.tsv", sep="\t") ad.var = meta sc_df = ad.to_df() sc_df.index = sc_df.index.str.split("|").str[0] sc_df.columns = ad.var["cellId"] # randomly select 3000 sc for training to shorter training time idx = random.sample(range(sc_df.shape[1]), 3000) sc_df_f = sc_df.iloc[:, idx] sc_df_f = sc_df_f.T # then we can get pathway # the pathway can be downloaded from https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp # for this ST data, we used hallmark pathways. # for the imputation, we just need the list of genes of the pathway # change the dataframe to anndata format sp_adata = sc.AnnData(mat_f) sc_adata = sc.AnnData(sc_df_f) # filter the low expressed genes sc.pp.filter_genes(sp_adata, min_cells=1) sc.pp.filter_genes(sc_adata, min_cells=1) # train the model mapper.pp_adatas(sc_adata, sp_adata) map = mapper.mapping(sc_adata, sp_adata, pthw_genes, sp_coords=coords, ncell_thres=10, sp_celltypes=cluster["Cluster"], lambda_g2=2, num_epochs=500) # get the predictions pthw_exp = utils.project_genes(adata_map=map, adata_sc=sc_adata, pthw=genes) # we can save the predictions as csv file for the downstream analysis d = pd.DataFrame([pthw_exp.values, coords_f.loc[:, "x"], coords_f.loc[:, "y"]]) d = d.T d.columns = ["pthw_exp", "x", "y"] d.to_csv("./predicted.csv") ``` ## Pathway enrichment (run in R) ``` library(dplyr) path <- "path_to_imputed_pathway_folder" # the folder contains the CSV files for the pathways (pathway1.csv, pathway2.csv, ...) f <- list.files(path) # read the cell type file celltype <- read.csv("celltype.csv") # change to your own cell type file celltype_table <- table(celltype$cell_type) # a table to see cell count for each cell type test <- NULL for(i in seq(f_immun)){ wilc_test <- NULL for(j in seq(nrow(cell_table))){ d <- read.csv(paste0(path, f[i])) # this read the imputed pathway d$cell_type <- celltype$cell_type d$group <- ifelse(d$Cell_type == cell_table$Var1[j], 1, 2) wilc_test[[j]] <- wilcox.test(X0 ~ group, data=d, alternative="greater") } test[[i]] <- wilc_test } test_table <- NULL for(i in seq(test)){ target <- test[[i]] target <- lapply(target, function(y) data.frame(wilcox_stat = y$statistic, wilcox_p = y$p.value)) target <- do.call(rbind, target) target$cell_type <- cell_table$Var1 target$pathway <- gsub(".csv", "", f_immun[i]) test_table[[i]] <- target } test_table <- do.call(rbind, test_table) test_table[test_table$wilcox_p >= 0.05, ]$wilcox_stat <- NA # generate the plot ggplot(test_table, aes(celltype, pathway, size=wilcox_stat, color=celltype)) + geom_point() + theme_classic() + scale_color_npg() + labs(x="Cell type") + theme(legend.position="none", axis.text.x=element_text(angle=30, hjust=1), axis.text.y=element_text(size=8), axis.title.y=element_blank()) ``` ## Apply IRIS (run in R) ``` library(IRIS) library(data.table) # read the ST and SC expression matrix st_d <- read.csv("path_to_ST_data") st_coord <- read.csv("path_to_ST_data_coordinates") sc_d <- read.csv("path_to_SC_data") sc_meta <- read.csv("path_to_SC_data_meta") # need cell type information and suject information # set up inputs for iris rownames(st_d) <- paste0(coord$centroidX, "x", coord$centroidY) # change cell index st_d <- t(st_d) rownames(st_coord) <- paste0(st_coord$centroidX, "x", st_coord$centroidY) colnames(st_coord) <- c("x", "y") sc_meta <- sc_meta[, c("name", "class", "donor")] # cell name, cell type, and sample name for each cell rownames(sc_meta) <- sc_meta$sample_name sp_count_list <- list(sp=st_d) sp_coord_list <- list(sp=st_coord) sc_d <- as.matrix(sc_d) IRIS_object <- createIRISObject(spatial_countMat_list=sp_count_list, spatial_location_list=sp_coord_list, sc_count=sc_d, sc_meta=sc_meta, ct.varname="class", sample.varname="donor", minCountGene=10, minCountSpot=3) numCluster = 2 #### perform the integrative and refernce-informed domain detection IRIS_object <- IRIS_spatial(IRIS_object,numCluster = numCluster) colors <- c("#ebe5c2", "#D57358") # extract the domain labels detected by IRIS IRIS_domain <- IRIS_object@spatialDomain[,c("Slice","spotName","IRIS_domain")] # extract the spatial location information spatial_location <- IRIS_object@spatialDomain[,c("x","y")] spatial_location$x <- IRIS_object@spatialDomain$y spatial_location$y <- -IRIS_object@spatialDomain$x IRIS.visualize.domain(IRIS_domain, spatial_location, colors=colors, numCols=1) ``` # Mefish Mouse Brain Dataset ## Imputation The ST dataset can be downloaded from https://alleninstitute.github.io/abc\_atlas\_access/descriptions/Zhuang-ABCA-3.html. A detailed download instructions is provided. The example session is 'Zhuang-ABCA-3': 'Zhuang-ABCA-3.010'. And we included the celltype (class) with more than 40 cells. A notebook of how we get the data can be found at our GitHub repo. The scRNA-seq dataset can be downloaded from https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-v1-and-alm-smart-seq (Primary Visual Cortex (VISp)). ``` # the following script is written in py3 import pandas as pd import numpy as np import scanpy as sc import random import os, sys sys.path.append('./pasta') import __init__ import _version import optimizer import mapper import utils # load the sp data gene = pd.read_csv("../merfish_zhang/gene.csv") sp = sc.read_h5ad("../merfish_zhang/abca3.h5ad") mat = pd.DataFrame(sp.X) mat.index = sp.obs.index mat.columns = gene["gene_symbol"] # get the coords dataframe coords = sp.obs.iloc[:, 6:16] # load the scRNA-seq sc_df = pd.read_csv("./mouse_VISp_2018-06-14_intron-matrix.csv") # downloaded from Allen institute sc_gene = pd.read_csv("./mouse_VISp_2018-06-14_genes-rows.csv") sc_df.index = sc_gene["gene_symbol"] sc_df = sc_df.drop("Unnamed: 0", axis=1) sc_df = sc_df.T # select cells so that we can shorter the training time random.seed(2333) idx = random.sample(range(sc_df.shape[0]), 3000) sc_df_f = sc_df.iloc[idx, :] sp_adata = sc.AnnData(mat) sc_adata = sc.AnnData(sc_df_f) sc.pp.filter_genes(sp_adata, min_cells=1) sc.pp.filter_genes(sc_adata, min_cells=20) # then we can get pathway # the pathway can be downloaded from https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp # for this ST data, we used GOBP pathways which name contains brain. # for the imputation, we just need the list of genes of the pathway # train the model mapper.pp_adatas(sc_adata, sp_adata) map = mapper.mapping(sc_adata, sp_adata, pthw_genes, sp_coords=coords[["x", "y"]], ncell_thres=10, sp_celltypes=coords["class"], lambda_g2=2, num_epochs=500) # get the predictions pthw_exp = utils.project_genes(adata_map=map, adata_sc=sc_adata, pthw=pthw_genes) # we can save the predictions as csv file for the downstream analysis d = pd.DataFrame([pthw_exp.values, coords.loc[:, "x"], coords.loc[:, "y"]]) d = d.T d.columns = ["pthw_exp", "x", "y"] d.to_csv("./predicted.csv") ``` ## Cluster using the pathway matrix ``` import pandas as pd import scanpy as sc import os from sklearn.cluster import KMeans path = "path_to_pathway_expression_folder" f = os.listdir(path) data_l = [] for file in f: d = pd.read_csv(path + file) d.index = d["cell_label"] d = d.drop("cell_label", axis=1) d_sum = d.sum(axis=1) data_l.append(d_sum) data = pd.DataFrame(data_l) data = data.T kmeans = KMeans(n_clusters=2) kmeans.fit(data) label_k2 = kmeans.labels_ cluster = pd.DataFrame([label_k2]) cluster = cluster.T cluster.columns = ["cluster_2"] cluster.to_csv("kmeans_pathway_cluster.csv") ``` ## Trajectory analysis (run in R) ``` library(monocle3) # read the meta file contains cells name of ST data and the cell types meta <- read.csv("path_to_meta_file") path <- "path_to_pathway_expression_folder" f <- list.files(path) # read the pathwaye expression pthw_v <- NULL for(i in seq(f)){ d <- read.csv(paste0(path, f[i])) # you might need to change "pathway_expression" depends on your pthw_v[[gsub(".csv", "", f[i])]] <- d$pathway_expression } # prepare the matrix for monocle3 pthw_mat <- do.call(cbind, pthw_v) pthw_mat <- t(pthw_mat) colnames(pthw_mat) <- meta$cells gene_meta <- data.frame(gene_short_name = rownames(pthw_mat), n_cells = rowSums(pthw_mat > 0)) cell_meta <- data.frame(cell = colnames(pthw_mat)) cell_meta$cell.type <- meta$celltype cell_meta$num_gene_expressed <- colSums(pthw_mat > 0) rownames(cell_meta) <- colnames(pthw_mat) # run monocle3 cds <- new_cell_data_set(pthw_mat, cell_meta, gene_meta) cds <- preprocess_cds(cds, num_dim = 50) cds <- reduce_dimension(cds) cds <- cluster_cells(cds) cds <- learn_graph(cds) cds <- order_cells(cds, root_cells="a_cell_name") # you can choose the cell you want as the root traj.plot <- plot_cells(cds, color_cells_by = "pseudotime") d_pseudotime <- traj.plot["data"] # this will be the dataframe contains pseudotime for each cell ``` # ISS Mouse Primary Visula Cortex Dataset The dataset can be downloaded from https://github.com/spacetx-spacejam/data (ISS, mouse VISP). The scRNA-seq dataset can be downloaded from https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-v1-and-alm-smart-seq (Primary Visual Cortex (VISp)) ``` # the following script is written in py3 import pandas as pd import numpy as np import scanpy as sc import random import os, sys sys.path.append('./pasta') import __init__ import _version import optimizer import mapper import utils sp = pd.read_csv("../mouse_visp_iss/HCA09_2_cellxgene.csv") #sp = sp.loc[sp["centroidX"] < 30000, :] coords = sp.iloc[:, 120:122] sp = sp.iloc[:, 1:120] # the cluster can be found in our GitHub repo # the cluster was obtained using the R package Seurat cluster_f = pd.read_csv("./mm09_seurat_cluster.csv") # load the scRNA-seq sc_df = pd.read_csv("./mouse_VISp_2018-06-14_intron-matrix.csv") # downloaded from Allen institute sc_gene = pd.read_csv("./mouse_VISp_2018-06-14_genes-rows.csv") sc_df.index = sc_gene["gene_symbol"] sc_df = sc_df.drop("Unnamed: 0", axis=1) sc_df = sc_df.T # select cells so that we can shorter the training time random.seed(2333) idx = random.sample(range(sc_df.shape[0]), 3000) sc_df_f = sc_df.iloc[idx, :] # change the dataframe to anndata format sp_adata = sc.AnnData(mat_f) sc_adata = sc.AnnData(sc_df_f) # filter the low expressed genes sc.pp.filter_genes(sp_adata, min_cells=1) sc.pp.filter_genes(sc_adata, min_cells=1) # train the model mapper.pp_adatas(sc_adata, sp_adata) map = mapper.mapping(sc_adata, sp_adata, pthw_genes, sp_coords=coords[["centroidX", "centroidY"]], ncell_thres=10, sp_celltypes=cluster_f["cluster"], lambda_g2=2, num_epochs=500) # get the predictions pthw_exp = utils.project_genes(adata_map=map, adata_sc=sc_adata, pthw=genes) # we can save the predictions as csv file for the downstream analysis d = pd.DataFrame([pthw_exp.values, coords.loc[:, "centroidX"], coords.loc[:, "centroidY"]]) d = d.T d.columns = ["pthw_exp", "x", "y"] d.to_csv("./predicted.csv") ``` # CosMx Human frontal cortex dataset ## Imputation The ST dataset can be downloaded from https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/human-frontal-cortex-ffpe-dataset/. The scRNA-seq dataset is collected from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104276. ``` # the following script is written in py3 import pandas as pd import numpy as np import scanpy as sc import random import os, sys sys.path.append('./pasta') import __init__ import _version import optimizer import mapper import utils # load the ST dataset # teh expression and coords are obtained from the CosMX website. mat = pd.read_csv("expression_rnanormalized.csv") mat.index = mat.iloc[:, 0] mat_f = mat.drop("Unnamed: 0", axis=1) mat_f = mat_f.T coords = pd.read_csv("coords.csv") coords.index = coords.iloc[:, 0] # load the scRNA-seq dataset sc_df = pd.read_csv("../cosmx/hm_frontal_cortex/GSE104276_count.csv") sc_df.index = sc_df.iloc[:, 0] sc_df = sc_df.drop("Unnamed: 0", axis=1) sc_df = sc_df.T # change the dataframe to anndata format sp_adata = sc.AnnData(mat_f) sc_adata = sc.AnnData(sc_df_f) # filter the low expressed genes sc.pp.filter_genes(sp_adata, min_cells=1) sc.pp.filter_genes(sc_adata, min_cells=1) # train the model mapper.pp_adatas(sc_adata, sp_adata) map = mapper.mapping(sc_adata, sp_adata, pthw_genes, sp_coords=coords[["x_slide_mm", "y_slide_mm"]], ncell_thres=10, sp_celltypes=coords["RNA_nbclust_clusters_refined"], lambda_g2=2, num_epochs=500) # get the predictions pthw_exp = utils.project_genes(adata_map=map, adata_sc=sc_adata, pthw=genes) # we can save the predictions as csv file for the downstream analysis d = pd.DataFrame([pthw_exp.values, coords.loc[:, "x_slide_mm"], coords.loc[:, "y_slide_mm"]]) d = d.T d.columns = ["pthw_exp", "x", "y"] d.to_csv("./predicted.csv") ``` ## Apply tissue ``` # the step by step tissue tutorial can be found from its github: https://github.com/sunericd/TISSUE import tissue.main, tissue.downstream import numpy as np import pandas as pd import matplotlib.pyplot as plt import scanpy as sc import anndata as ad import os import seaborn as sns # first, we read the predictions # each csv file should be the pathway-by-cell expressoin matrix. tangram = pd.read_csv("path_to_tangram_pathway_pred.csv", index_col=0) spage = pd.read_csv("path_to_spage_pathway_pred.csv", index_col=0) seurat = pd.read_csv("path_to_seurat_pathway_pred.csv", index_col=0) stdiff = pd.read_csv("path_to_stdiff_pathway_pred.csv", index_col=0) stplus = pd.read_csv(".path_to_stplus_pathway_pred.csv", index_col=0) gimvi = pd.read_csv(".path_to_gimvi_pathway_pred.csv", index_col=0) pasta = pd.read_csv("path_to_pasta_pathway_predict.csv", index_col=0) true = pd.read_csv("path_to_true_pathway_pred.csv", index_col=0) # read coordinates coords = pd.read_csv("path_to_coords.csv", index_col=0) # set index if the prediction matrix index is not cell name true.columns = coords.index tangram.columns = coords.index pasta.columns = coords.index spage.columns = coords.index seurat.columns = coords.index stplus.columns = coords.index gimvi.columns = coords.index stdiff.columns = coords.index # set up anndata for tissue myadata = sc.AnnData(true.T) myadata.obsm["spatial"] = coords[["x_slide_mm", "y_slide_mm"]] myadata.obsm["tangram_predict"] = tangram.T myadata.obsm["pasta_predict"] = pasta.T myadata.obsm["spage_predict"] = spage.T myadata.obsm["seurat_predict"] = seurat.T myadata.obsm["stplus_predict"] = stplus.T myadata.obsm["gimvi_predict"] = gimvi.T myadata.obsm["stdiff_predict"] = stdiff.T tissue.main.build_spatial_graph(myadata, method="fixed_radius", n_neighbors=15) tissue.main.conformalize_spatial_uncertainty(myadata, "pasta_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) tissue.main.conformalize_spatial_uncertainty(myadata, "tangram_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) tissue.main.conformalize_spatial_uncertainty(myadata, "spage_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) tissue.main.conformalize_spatial_uncertainty(myadata, "stplus_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) tissue.main.conformalize_spatial_uncertainty(myadata, "gimvi_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) tissue.main.conformalize_spatial_uncertainty(myadata, "seurat_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) tissue.main.conformalize_spatial_uncertainty(myadata, "stdiff_predict", calib_genes=myadata.var_names, grouping_method="kmeans_gene_cell", k=4, k2=2) # get the score nn_pasta = myadata.obsm["pasta_predict_score"].to_numpy().flatten() nn_tangram = myadata.obsm["tangram_predict_score"].to_numpy().flatten() nn_spage = myadata.obsm["spage_predict_score"].to_numpy().flatten() nn_seurat = myadata.obsm["seurat_predict_score"].to_numpy().flatten() nn_gimvi = myadata.obsm["gimvi_predict_score"].to_numpy().flatten() nn_stplus = myadata.obsm["stplus_predict_score"].to_numpy().flatten() nn_stdiff = myadata_stdiff.obsm["stdiff_predict_score"].to_numpy().flatten() dd = pd.DataFrame([nn_pasta, nn_tangram, nn_spage, nn_seurat, nn_stplus, nn_stdiff, nn_gimvi]) dd.index = ["PASTA", "Tangram", "SpaGE", "Seurat", "GimVI", "StPlus", "StDiff"] dd_box = dd.T p.clear() fig.clear() p = sns.kdeplot(dd_box) plt.xlim(0, 10) fig = p.get_figure() fig.savefig("overall_density.png") ```