Exporting Data

library(SEEPS)
#>                               __   
#>                              \ \  
#>  _____________________________\ \ 
#> /_____/_____/_____/_____/_____/ / 
#>                              /_/  
#>    _____ ________________  _____  
#>   / ___// ____/ ____/ __ \/ ___/  
#>   \__ \/ __/ / __/ / /_/ /\__ \   
#>  ___/ / /___/ /___/ ____/___/ /   
#> /____/_____/_____/_/    /____/    
#>   __                              
#>  / /                              
#> / / ______________________________
#> \ \/_____/_____/_____/_____/_____/
#>  \_\                              
#> 
#> 
#> Attaching package: 'SEEPS'
#> The following object is masked from 'package:stats':
#> 
#>     step
#> The following object is masked from 'package:methods':
#> 
#>     initialize
require(ape)
#> Loading required package: ape

In this tutorial, we’ll look at how to export data from a SEEPS simulation. SEEPS generates three kinds of data for export:

  1. Trees (newick string format)
  2. Pairwise distance matrices (matrix)
  3. Simulated sequences (fasta)

Trees and pairwise distance matrices are generated from a genealogy or a phylogeny, which are coded in an array of the same structure.

Sample data

First, let’s generate some data for a population of 150 individuals with 15 samples (10% sampling coverage).

set.seed(1947)
params <- list(
    "rate_function_parameters" = list("R0" = 2),
    "minimum_population" = 5, "maximum_population_target" = 150,
    "total_steps_after_exp_phase" = 48, "mutation_rate" = 0.0067,
    "a" = 5, "b" = 5
)

biphasic_rate_function <- get_biphasic_HIV_rate(
    params = params[["rate_function_parameters"]]
)

simulator_result <- gen_transmission_history_exponential_constant(
    minimum_population = params[["minimum_population"]],
    offspring_rate_fn = biphasic_rate_function,
    total_steps = params[["total_steps_after_exp_phase"]],
    maximum_population_target = params[["maximum_population_target"]],
    spike_root = FALSE
)

target_sample <- random_fixed_size_ids( # Random sampling
    active = simulator_result[["active"]],
    minimum_size = params[["minimum_population"]],
    spike_root = FALSE
)

transmission_history <- reduce_transmission_history(
    samples = target_sample[["samples"]],
    parents = simulator_result[["parents"]],
    current_step = simulator_result[["t_end"]],
    spike_root = FALSE
)

# Get the sequence length
V3_sequence <- SEEPS::lookup_sequence_by_name(
    organism_name = "HIV1", # nolint: object_name_linter
    region_name = "Env-Clinical"
)

# Randomize edge lengths
geneology <- stochastify_transmission_history(
    transmission_history = transmission_history$geneology,
    rate = params[["mutation_rate"]] / 12 * length(V3_sequence)
) # Provide in rate per sequence per year
# Get the rate model for the V3 region
rate_model <- SEEPS::get_V3_rate_model(nonzero_I = TRUE)

print(geneology)
#> $geneology
#>       [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#>  [1,]    1    7    0   52    6    1  290
#>  [2,]    2    6    0   25    6    1  366
#>  [3,]    3    7    0   52   16    1  281
#>  [4,]    4    6    0   25    4    1  325
#>  [5,]    5    8    0   53    9    1  323
#>  [6,]    6    8   25   28    5    0    0
#>  [7,]    7    9   52    9    1    0    0
#>  [8,]    8    9   53    8    1    0    0
#>  [9,]    9    0   61    0    0    0    0
#> [10,]    0    0    0    0    0    0    0
sequences_fasta <- SEEPS::generate_sequences(
    phylogeny = geneology$geneology,
    branch_rate = params[["mutation_rate"]] / 12,
    root_sequence = V3_sequence,
    rate_model = rate_model,
    rate_per_nt = FALSE
) # Distances in phylogeny are already in per nt
newick_tree <- SEEPS::phylogeny_to_newick(geneology$geneology, mode = "mean", label_mode = "abs")
sequences_df <- SEEPS::fasta_string_to_dataframe(sequences_fasta)
matrix <- SEEPS::build_distance_matrix_from_df(sequences_df)

The last 3 lines of the above code are the only lines that are needed to prepare data for export.

Trees

A phylogeny or a genealogy can be converted to a Newick tree using phylogeny_to_newick. This function takes one required and two optional arguments.

  • The mode option selects between the expected number mean or the sampled number of mutations mu for the branch lengths.
  • The label_mode option determines whether to use abs (values in 1:number of simulated infections) or local (values of 1:sample size).

The returned Newick string can be read into ape and visualized.

print(newick_tree)
#> [1] "((290_:52,281_:52):9,(323_:53,(366_:25,325_:25):28):8);"
tree <- ape::read.tree(text = newick_tree)
plot(ape::ladderize(tree))

A ladderized phylogenetic tree plot

Alternatively, we can export the tree to a text file using standard library tools.

fileConnection <- file("tree.txt", "w")
writeLines(newick_tree, fileConnection)
close(fileConnection)

Sequences (fasta)

Sequence simulation is available with SeqGen. By default, generate_sequences returns a string that represents a fasta 2-line text file. This file can be immediately written out to storage.

print(sequences_fasta)
#> [1] ">290_\nCATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAACTGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCATGGACAATGATAAAACCATAATAGTACAGCTGAACACATCAGTAGAAATTAATCGTACAAGACCCAACAACAATACAAGAAAAGGAATCCGTATCCAGAGAGGACCAGGGAAAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTATAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAGTCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCATAGTTTTAATTGT\n"
#> [2] ">281_\nCATGGAATTAGGCCAGTAGTATCAAATCAACTGCTATTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGATAATGCTAAAACCATAATAGTACAGCTGAAAACATCTGTAGAAATTAATAGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAAAGCAAAATGGAATAACACTATAAAACAGATAGCTAGCAAATTAATAGAACAATTTGGAAATAATGAAACAATAATCTTTAAACAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGT\n"
#> [3] ">323_\nCATGGAATTAGGCCAGAAGTATCAACTCAACTGCTATTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGATAAAACCATAATAGTACAGCTGAATACATCTGTAGAAATTAATTGTACAAGACCCAACAACTATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAAAAGAAAGCTAGCAAATCAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAGGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGT\n"
#> [4] ">366_\nCATGGAATTAGGCCAGTAGTATCAACTCAACGGCTATTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATCGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTATAACATTGGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAGATTATAAAACAATAGTCTTCAAGCAATCCTCAGGAGGAGACCCAGAAATTGTAACGCACAGTTTTAACTGT\n"
#> [5] ">325_\nCATGGAATTAGGCCAGTAGTATCAACTCAACTGCTATTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTACCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTATAACATTGGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTGGCAAATTAAGAGAACAATTTGGAGATAATAAAACAATAGTCTTCAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGT\n"
#> [6] ">root\nCATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGT\n"
#> [7] ""

The root sequence used to seed SeqGen is also returned in the fasta output, with the name/identifier of root.

However, a serialization and reloading step is not necessary, if you are continuing to work with the sequences in R (to either run downstream analyses, or add sequencing artifacts). The fasta_string_to_dataframe function converts the string to a data frame, can be further manipulated before being written out to storage.

print(sequences_df)
#>            seq name
#> 1 CATGGAAT....  290
#> 2 CATGGAAT....  281
#> 3 CATGGAAT....  323
#> 4 CATGGAAT....  366
#> 5 CATGGAAT....  325

Pairwise distance matrices

A tree-free reduced representation of the evolutionary history of a set of samples is a pairwise distance matrix. Distances can be calculated either using the simulated tree (cophenetic distance metric) or calculated from simulated sequences (TN93).

Tree-based distance matrix

As a phylogeny or genealogy specifies the necessary branch lengths and topology of a tree, it can be used to calculate a distance matrix.

distance_matrix <- SEEPS::geneology_to_distance_matrix(
    geneology = geneology$geneology
)

print(distance_matrix)
#>     290 366 281 325 323
#> 290   0  19  22  17  17
#> 366  19   0  29  10  20
#> 281  22  29   0  27  27
#> 325  17  10  27   0  18
#> 323  17  20  27  18   0

Sequence-based distance matrix

The output of the sequence simulator can be converted directly to a distance matrix, as we did above.

sequences_df <- SEEPS::fasta_string_to_dataframe(sequences_fasta)
matrix <- SEEPS::build_distance_matrix_from_df(sequences_df)

This gives a matrix.

print(matrix)
#>          290      281      323       366       325
#> 290  0.00000 19.76165 17.63093 15.488872 14.476468
#> 281 19.76165  0.00000 16.48479 18.699251 16.588398
#> 323 17.63093 16.48479  0.00000 18.691663 15.488872
#> 366 15.48887 18.69925 18.69166  0.000000  7.121879
#> 325 14.47647 16.58840 15.48887  7.121879  0.000000

SeqGen uses the phylogenetic tree to determine when to branch a sequence, using a GTR+Γ model to select for mutations. The tree-based distance matrix approach does not use information about the originating sequence, resulting in different mutational patterns. These distances are rescaled by the length of the sequences (so the entries take values in [0, len(root sequence)] rather than [0, 1]), so they can be compared with the number of mutations in the tree-based distance matrix.