% pubman genre = article
%
% Journal article record exported from PubMan. Cleaned up relative to the raw export:
%   - HTML line-break remnants (an escaped "br" tag) removed from the abstract,
%     where they would otherwise be typeset as literal text in mid-sentence.
%   - Redundant outer brace pairs removed from title, abstract and journal so the
%     bibliography style, not the database, decides title casing.
%   - Publication month given as year + month instead of a parallel date field.
% NOTE(review): the export had two consecutive line-break remnants directly before
%   "1 percent" in the abstract, which suggests a character (possibly an
%   approximation sign) was dropped there -- verify against the published abstract.
% NOTE(review): confirm the ISSN against the journal's own record.
@article{doi:10.1093/gbe/evy199,
  author    = {Mafessoni, Fabrizio and Prasad, Rashmi B. and Groop, Leif and Hansson, Ola and Pr{\"u}fer, Kay},
  title     = {Turning Vice into Virtue: Using Batch-Effects to Detect Errors in Large Genomic Data Sets},
  journal   = {Genome Biology and Evolution},
  volume    = {10},
  number    = {10},
  pages     = {2697--2708},
  year      = {2018},
  month     = oct,
  publisher = {Oxford University Press},
  address   = {Oxford},
  issn      = {2192-8851},
  doi       = {10.1093/gbe/evy199},
  language  = {eng},
  abstract  = {It is often unavoidable to combine data from different sequencing centers or
               sequencing platforms when compiling data sets with a large number of individuals.
               However, the different data are likely to contain specific systematic errors that
               will appear as SNPs. Here, we devise a method to detect systematic errors in
               combined data sets. To measure quality differences between individual genomes, we
               study pairs of variants that reside on different chromosomes and co-occur in
               individuals. The abundance of these pairs of variants in different genomes is then
               used to detect systematic errors due to batch effects. Applying our method to the
               1000 Genomes data set, we find that coding regions are enriched for errors, where
               1{\textpercent} of the higher frequency variants are predicted to be erroneous,
               whereas errors outside of coding regions are much rarer
               ({\textless}0.001{\textpercent}). As expected, predicted errors are found less
               often than other variants in a data set that was generated with a different
               sequencing technology, indicating that many of the candidates are indeed errors.
               However, predicted 1000 Genomes errors are also found in other large data sets; our
               observation is thus not specific to the 1000 Genomes data set. Our results show
               that batch effects can be turned into a virtue by using the resulting variation in
               large scale data sets to detect systematic errors.},
}