spqrp on mock data

This vignette mirrors Example_Mock_Data.Rmd from the Python package, but uses the native R package. No Python install is required.

Load data

The package ships a small mock cohort and a matching protein ranking, both in long format with the required columns (Sample_ID, Patient_ID, Protein, Intensity).

library(spqrp)

df      <- spqrp_example_data("input_cohort_df")
ranking <- spqrp_example_data("protein_ranking")

head(df)
#> # A tibble: 6 × 4
#>   Sample_ID Patient_ID Protein Intensity
#>   <chr>     <chr>      <chr>       <dbl>
#> 1 1_S1      P001       ProtA        0.39
#> 2 1_S1      P001       ProtB        1   
#> 3 1_S1      P001       ProtC        0.7 
#> 4 1_S1      P001       ProtD        0.57
#> 5 1_S1      P001       ProtE        0.13
#> 6 1_S2      P001       ProtA        0.33
ranking
#> # A tibble: 5 × 2
#>   Protein Importance
#>   <chr>        <dbl>
#> 1 ProtA         0.95
#> 2 ProtC         0.89
#> 3 ProtB         0.72
#> 4 ProtD         0.12
#> 5 ProtE         0.01

Run clustering

run_clustering() builds a kNN graph in a 2D embedding (PCA / MDS / UMAP), iteratively splits oversized components, and visualises the result.

Use n_neighbors = expected_samples_per_patient - 1 and max_component_size = expected_samples_per_patient for the typical case where every patient should appear in their own small cluster.

res <- run_clustering(
  df = df, ranking = ranking,
  n_neighbors = 1L,
  max_component_size = 2L,
  metric = "manhattan",
  method = "PCA",  # switch to "UMAP" once a cohort is large enough
  plot_name = "Mock ranking on Mock data",
  #quiet = FALSE.
  #save_path = Mock_data_clustering.png
)

It is recommended to specify ‘save_path’ to save the clustering with a better visualization with less cramped clusters (adjustable figsize and dpi). The plot in res$plot is a ggplot2 object — you can save or modify it like any other ggplot. Cluster bookkeeping is also returned as plain R lists:

head(res$cluster_assignments, 8)
#>  1_S1  1_S2 10_S1 10_S2 11_S1 11_S2 12_S1 12_S2 
#>     1     1     2     2     3     3     4     5
res$uncertain_samples
#> character(0)
res$error_candidate_samples
#> [1] "12_S1" "5_S2"  "12_S2" "5_S1"
res$plot

res$transitive_results
#> $precision
#> [1] 0.9333333
#> 
#> $sensitivity
#> [1] 0.9333333
#> 
#> $f1
#> [1] 0.9333333
#> 
#> $accuracy
#> [1] 0.9977401
#> 
#> $`balanced accuracy`
#> [1] 0.966092
#> 
#> $ari
#> [1] 0.9321839
#> 
#> $nmi
#>         1 
#> 0.9864137 
#> 
#> $false_negatives
#> $false_negatives[[1]]
#> [1] "12_S1" "12_S2"
#> 
#> $false_negatives[[2]]
#> [1] "5_S1" "5_S2"

cluster_assignments — named integer vector mapping sample → cluster ID
uncertain_samples — samples that should have a same-patient match in the cohort but ended up isolated
error_candidate_samples — samples connected to a different-patient neighbour (i.e. probable mix-ups)

Threshold-based evaluation

perform_distance_evaluation_on_ranked_proteins() computes pairwise distances on the top-n ranked proteins, picks a cutoff at the p-th percentile, and reports classification metrics against the patient ID ground truth.

result <- perform_distance_evaluation_on_ranked_proteins(
  df = df,
  top_importance_df = ranking,
  metric = "manhattan",
  p = 0.989,
  n = 4L,
)
result$cutoff
#> [1] 0.21
result$eval_metrics[c("TP", "FP", "FN", "TN", "Precision", "Sensitivity", "F1")]
#> $TP
#> [1] 15
#> 
#> $FP
#> [1] 4
#> 
#> $FN
#> [1] 15
#> 
#> $TN
#> [1] 1736
#> 
#> $Precision
#> [1] 0.7894737
#> 
#> $Sensitivity
#> [1] 0.5
#> 
#> $F1
#> [1] 0.6122449
result$plot

Compute your own protein ranking

If you don’t have a precomputed ranking, train a pairwise random-forest classifier on the raw data. Three balancing strategies are available via classifier_backend; the differences are documented at https://github.com/fhradilak/spqrp_r/blob/main/articles/numerical-divergence.md.

Avoid applying preprocessing or filtering steps to the data frame you pass in. Anything that uses cohort-wide information — median normalization, occurrence filtering, outlier detection on the full cohort — leaks the held-out fold into the training fold and inflates the importance estimates. The built-in outlier_removal = TRUE option is safe: it fits the isolation forest inside each train/test split rather than on the whole cohort.

results <- train_with_normalise(
  df,
  plate_corrected   = FALSE,  # mock data has no plate column
  outlier_removal   = FALSE   # skip on tiny data
  # classifier_backend defaults to "randomForest" — closest to imblearn's
  # BalancedRandomForestClassifier. Pass "ranger" for a faster (but more
  # divergent from original Python package) backend.
)

new_ranking <- retrieve_ranking(results)
new_ranking
#> # A tibble: 5 × 2
#>   Protein Importance
#>   <chr>        <dbl>
#> 1 ProtD        0.249
#> 2 ProtE        0.242
#> 3 ProtB        0.185
#> 4 ProtC        0.177
#> 5 ProtA        0.147

Filtering (optional)

Isolation-forest-based outlier detection flags samples whose protein profiles look anomalous relative to the rest of the cohort. Skip it on very small cohorts (the algorithm has little to learn from < ~30 samples) — it’s mostly useful at production scale where one mis-pipetted sample can drag clustering off course.

# Drop flagged samples in one step:
filtered <- remove_outlier_samples(df, contamination = "auto")

filtered$outlier_list      # samples flagged as anomalous
#> [1] "22_S2" "24_S1" "24_S2" "25_S1" "25_S2" "28_S1" "3_S1"  "3_S2"
filtered$anomaly_df        # per-sample anomaly scores
#> # A tibble: 60 × 3
#>    Sample_ID `Anomaly Score` Outlier
#>    <chr>               <dbl> <lgl>  
#>  1 10_S1               0.590 FALSE  
#>  2 10_S2               0.584 FALSE  
#>  3 11_S1               0.579 FALSE  
#>  4 11_S2               0.573 FALSE  
#>  5 12_S1               0.563 FALSE  
#>  6 12_S2               0.573 FALSE  
#>  7 13_S1               0.576 FALSE  
#>  8 13_S2               0.577 FALSE  
#>  9 14_S1               0.566 FALSE  
#> 10 14_S2               0.565 FALSE  
#> # ℹ 50 more rows
head(filtered$df)          # input df minus the flagged samples
#> # A tibble: 6 × 4
#>   Sample_ID Patient_ID Protein Intensity
#>   <chr>     <chr>      <chr>       <dbl>
#> 1 1_S1      P001       ProtA        0.39
#> 2 1_S1      P001       ProtB        1   
#> 3 1_S1      P001       ProtC        0.7 
#> 4 1_S1      P001       ProtD        0.57
#> 5 1_S1      P001       ProtE        0.13
#> 6 1_S2      P001       ProtA        0.33

#Inspect the score distribution before deciding on a cutoff:
filtered$anomaly_plot

# Or call the underlying detector directly to keep the original df and
# only act on the flag list — useful when you want to surface candidates
# without auto-removing them:
forest <- by_isolation_forest(df, impute_median = TRUE,
                               contamination = 0.05)  # top 5% by score
forest$outlier_list

contamination = "auto" uses an anomaly-score cutoff calibrated for solitude (the underlying engine; see ?by_isolation_forest for why the default is 0.6 rather than sklearn’s 0.5). Pass a numeric fraction like contamination = 0.1 to flag a fixed percentile instead.

train_with_normalise() also runs the same filter internally via outlier_removal = TRUE (default), so you don’t need to call this explicitly before training for creating the protein ranking — only when you want to inspect or pre-filter the cohort (e.g. for the clustering or threshold approach) yourself.

The complete SQRP pipeline:

SPQRP consists of six main steps: preprocessing, protein selection with a Balanced Random Forest Classifier (BRF-Classifier), distance calculation, clustering-based classification, threshold-based classification, and performance calculation. The full SPQRP R packages functions can be used to run them in three stages: (1) preprocessing, (2) protein ranking via train_with_normalise(), and (3) clustering plus threshold-based evaluation on the filtered cohort (including, distance calculation, clustering- or respectively threshold-based classification, and performance calculation in one function). To follow it end-to-end, feed the new_ranking and (optionally) filtered data frame from stages 1–2 into run_clustering() and perform_distance_evaluation_on_ranked_proteins():

res <- run_clustering(
  df = filtered$df, ranking = new_ranking,
  n_neighbors = 1L,
  max_component_size = 2L,
  metric = "manhattan",
  method = "PCA",  # switch to "UMAP" once a cohort is large enough
  plot_name = "Mock ranking on Mock data",
  #quiet = FALSE.
  #save_path = filtered_mock_data_clustering.png
)

head(res$cluster_assignments, 8)
#>  1_S1  1_S2 10_S1 10_S2 11_S1 11_S2 12_S1 12_S2 
#>     1     1     2     2     3     3     4     5
res$uncertain_samples
#> character(0)
res$error_candidate_samples
#> [1] "12_S1" "5_S2"  "12_S2" "5_S1"
res$plot

result <- perform_distance_evaluation_on_ranked_proteins(
  df = filtered$df,
  top_importance_df = new_ranking,
  metric = "manhattan",
  p = 0.989,
  n = 4L,
)
result$cutoff
#> [1] 0.1910425
result$eval_metrics[c("TP", "FP", "FN", "TN", "Precision", "Sensitivity", "F1")]
#> $TP
#> [1] 13
#> 
#> $FP
#> [1] 1
#> 
#> $FN
#> [1] 12
#> 
#> $TN
#> [1] 1300
#> 
#> $Precision
#> [1] 0.9285714
#> 
#> $Sensitivity
#> [1] 0.52
#> 
#> $F1
#> [1] 0.6666667
result$plot

Preprocessing (optional)

The package mirrors the Python preprocessing pipeline. None of these steps are required for clustering — they’re convenience helpers if you want to start from raw intensities.

df_raw <- spqrp_example_data("input_cohort_df")

# Zeros become NA so missingness is explicit
df_raw$Intensity[df_raw$Intensity == 0] <- NA

df_pp <- df_raw |>
  log_transform() |>
  filter_by_occurrence(0.7)

# Per-sample median normalization (returns list(data, plot))
norm <- normalize_medianintensity(df_pp, plot = FALSE)
df_pp <- norm$data

# Plate-effect residualisation if a `plate` column exists
df_pp <- plate_correct_residuals_by_protein(df_pp)