% pubman genre = article
% Journal article describing the Sourcepredict software (PubMan export).
% Cleanup applied to the exported record: HTML <br> line-break artifacts
% removed from the abstract, the hyphenated line-wrap "Source-/Tracker"
% rejoined, whole-field double braces reduced to targeted protection of the
% proper noun in the title.
% NOTE(review): the export carries no volume/number fields -- confirm them
% against the DOI record before relying on this entry for a full citation.
@article{item_3257169,
  author   = {Borry, Maxime},
  title    = {{Sourcepredict}: Prediction of metagenomic sample sources using dimension reduction followed by machine learning classification},
  journal  = {The Journal of Open Source Software},
  year     = {2019},
  eid      = {01540},
  issn     = {2475-9066},
  doi      = {10.21105/joss.01540},
  language = {eng},
  abstract = {SourcePredict is a Python package distributed through Conda, to
              classify and predict the origin of metagenomic samples, given a
              reference dataset of known origins, a problem also known as
              source tracking.
              DNA shotgun sequencing of human, animal, and environmental
              samples has opened up new doors to explore the diversity of life
              in these different environments, a field known as metagenomics
              (Hugenholtz \& Tyson, 2008). One aspect of metagenomics is
              investigating the community composition of organisms within a
              sequencing sample with tools known as taxonomic classifiers,
              such as Kraken (Wood \& Salzberg, 2014).
              In cases where the origin of a metagenomic sample, its source,
              is unknown, it is often part of the research question to predict
              and/or confirm the source. For example, in microbial
              archaeology, it is sometimes necessary to rely on metagenomics
              to validate the source of paleofaeces.
              Using samples of known sources, a reference dataset can be
              established with the taxonomic composition of the samples, i.e.,
              the organisms identified in the samples as features, and the
              sources of the samples as class labels.
              With this reference dataset, a machine learning algorithm can be
              trained to predict the source of unknown samples (sinks) from
              their taxonomic composition.
              Other tools used to perform the prediction of a sample source
              already exist, such as SourceTracker (Knights et al., 2011),
              which employs Gibbs sampling.
              However, the Sourcepredict results are more easily interpreted
              since the samples are embedded in a human observable
              low-dimensional space. This embedding is performed by a
              dimension reduction algorithm followed by K-Nearest-Neighbours
              (KNN) classification.},
  contents = {Summary Method - Prediction of the proportion of unknown sources - Prediction of the proportion of known sources - Combining unknown and source proportions},
}