Simulation data
Our simulation based on a list of stere-seq datasets which can be downloaded at https://db.cngb.org/stomics/artista/download/.
Basic simulation code for PASTA
import pandas as pd
import numpy as np
import scanpy as sc
import random
import anndata
import scipy
import statistics
import os, sys
sys.path.append('./pasta')
import __init__
import _version
import optimizer
import mapper
import utils
ss_df = anndata.read_h5ad("path_to_the_h5ad_file")
pearson_cor = []
spearman_cor = []
mse = []
random.seed(233)
for i in range(30):
#random sample 3000 cells and 1000 genes as spatial
cells_n = random.sample(range(ss_df.shape[0]), 3000) ## modify if different region/celltype/subject
genes_n = random.sample(range(ss_df.shape[1]), 1000)
sp_adata = ss_df[cells_n, genes_n]
true_adata = ss_df[cells_n, ]
ss_df_2 = ss_df[~ss_df.obs.index.isin(sp_adata.obs.index), :]
cells_n2 = random.sample(range(ss_df_2.shape[0]), 6000)
sc_adata = ss_df_2[cells_n2, ]
# remove all-zero-valued genes
sc.pp.filter_genes(sp_adata, min_cells=1)
sc.pp.filter_genes(sc_adata, min_cells=1)
sc.pp.filter_genes(true_adata, min_cells=1)
# get dist information
dist = sp_adata.obsm["spatial"]
dist = pd.DataFrame.from_records(dist)
# random sample pathway
idx_n = random.sample(range(sp_adata.shape[1]), 50)
pathway = pd.DataFrame(sp_adata.var.index[idx_n])
# train the model
mapper.pp_adatas(sc_adata, sp_adata)
ad_map = mapper.mapping(sc_adata, sp_adata, genes, sp_coords=coords, ncell_thres=10,
sp_celltypes=cluster["Cluster"], lambda_g2=2, num_epochs=500)
pthw_exp = utils.project_genes(adata_map=ad_map, adata_sc=sc_adata, pthw=genes)
true_df = true_adata.to_df()
true = true_df.loc[:, true_df.columns.isin(pathway["new_name"].str.lower())]
true_v = true.sum(axis=1)
pearson_cor.append(scipy.stats.pearsonr(pred_v, true_v)[0])
spearman_cor.append(scipy.stats.spearmanr(pred_v, true_v)[0])
mse.append(((pred_v - true_v)**2).mean())
Basic simulation code for Tangram
import pandas as pd
import numpy as np
import scanpy as sc
import random
import os, sys
sys.path.append('./') # uncomment for local import
import tangram as tg
import anndata
import scipy
import statistics
ss_df = anndata.read_h5ad("path_to_the_h5ad_file")
pearson_cor = []
spearman_cor = []
mse = []
random.seed(233)
for i in range(30):
#random sample 3000 cells and 1000 genes as spatial
cells_n = random.sample(range(ss_df.shape[0]), 3000) ## modify if different region/celltype/subject
genes_n = random.sample(range(ss_df.shape[1]), 1000)
sp_adata = ss_df[cells_n, genes_n]
true_adata = ss_df[cells_n, ]
ss_df_2 = ss_df[~ss_df.obs.index.isin(sp_adata.obs.index), :]
cells_n2 = random.sample(range(ss_df_2.shape[0]), 6000)
sc_adata = ss_df_2[cells_n2, ]
# remove all-zero-valued genes
sc.pp.filter_genes(sp_adata, min_cells=1)
sc.pp.filter_genes(sc_adata, min_cells=1)
sc.pp.filter_genes(true_adata, min_cells=1)
# get dist information
dist = sp_adata.obsm["spatial"]
dist = pd.DataFrame.from_records(dist)
# random sample pathway
idx_n = random.sample(range(sp_adata.shape[1]), 50)
pathway = pd.DataFrame(sp_adata.var.index[idx_n])
tg.pp_adatas(sc_adata, sp_adata)
ad_map = tg.map_cells_to_space(sc_adata, sp_adata, num_epochs=1000)
ad_ge = tg.project_genes(adata_map=ad_map, adata_sc=sc_adata)
ge_df = ad_ge.to_df()
pred = ge_df.loc[:, ge_df.columns.isin(pathway["new_name"].str.lower())]
pred_v = pred.sum(axis=1)
true_df = true_adata.to_df()
true = true_df.loc[:, true_df.columns.isin(pathway["new_name"])]
true_v = true.sum(axis=1)
pearson_cor.append(scipy.stats.pearsonr(pred_v, true_v)[0])
spearman_cor.append(scipy.stats.spearmanr(pred_v, true_v)[0])
mse.append(((pred_v - true_v)**2).mean())
Basic simulation code for Seurat (run in R)
library(Seurat)
data <- readRDS("path_to_the_data.rds") # stereo-seq data has RDS file which can be read in R
pearson_r <- NULL
spearman_r <- NULL
mse <- NULL
for(i in seq(30)){
set.seed(i)
# sample ST and scRNA-seq data
cell_id <- sample(seq(dim(data)[2]), 3000)
gene_id <- sample(seq(dim(data)[1]), 1000)
cell_id2 <- sample(seq(dim(data)[2])[-cell_id], 3000)
spatial_d <- data@assays[["SCT"]]@counts[gene_id, cell_id]
sc_d <- data@assays[["SCT"]]@counts[, cell_id2]
# run seurat
sp_seurat <- CreateSeuratObject(counts = spatial_d, project = 'sp', assay = 'RNA')
sp_seurat <- NormalizeData(sp_seurat)
sp_seurat <- ScaleData(sp_seurat)
train_seu <- NormalizeData(object = train_seu)
train_seu <- FindVariableFeatures(object = train_seu, nfeatures = 2000)
train_seu <- ScaleData(object = train_seu)
train_seu <- RunPCA(object = train_seu, npcs = 50, verbose = FALSE)
train_seu <- RunUMAP(object = train_seu, dims = 1:50, nneighbors = 5)
features <- rownames(sp_seurat)
anchors <- FindTransferAnchors(
reference = train_seu,
query = sp_seurat,
features = features,
dims = 1:30,
reduction = 'cca')
refdata <- GetAssayData(object = train_seu)
imputation <- TransferData(anchorset = anchors,
refdata = refdata,
query = sp_seurat,
weight.reduction = 'pca')
pred_d <- imputation@assays[["id"]]@data
pathway <- c(sample(rownames(st_d), 70)) # if all observed in st
pred <- pred_d[rownames(pred_d) %in% pathway, ]
pred_v <- colSums(pred)
true_d <- data@assays[["SCT"]]@counts[, cell_id]
true_f <- true_d[rownames(true_d) %in% pathway, ]
true_v <- colSums(true)
pearson_r <- c(pearson_r, cor(pred_v, true_v))
spearman_r <- c(spearman_r, cor(pred_v, true_v, method="spearman"))
mse <- c(mse, mean((pred_v - true_v)^2))