% Journal article: literature review of how machine-learning prediction-model
% studies handle and report missing data (J. Clin. Epidemiol. 142, 2022).
% The citation key is the exporter's hash; it is kept as-is so existing
% citations of this entry keep resolving.
% The LaTeX specials & and # inside `note` are escaped so that styles which
% print `note` do not abort typesetting.
% NOTE(review): the UCC contact address in `note` is an obfuscation placeholder
% left by the export source; the real address cannot be recovered from this file.
@article{97df3f0e34c246c8b0d44514ed4c83db,
  author    = {Nijman, S. W. J. and Leeuwenberg, A. M. and Beekers, I. and Verkouter, I. and Jacobs, J. J. L. and Bots, M. L. and Asselbergs, F. W. and Moons, K. G. M. and Debray, T. P. A.},
  title     = {Missing data is poorly handled and reported in prediction model studies using machine learning: a literature review},
  journal   = {Journal of Clinical Epidemiology},
  year      = {2022},
  month     = feb,
  volume    = {142},
  pages     = {218--229},
  doi       = {10.1016/j.jclinepi.2021.11.023},
  issn      = {0895-4356},
  publisher = {Elsevier USA},
  language  = {English},
  keywords  = {Bias, Data Interpretation, Statistical, Humans, Machine Learning, Models, Statistical, Prognosis},
  abstract  = {Objectives: Missing data is a common problem during the development, evaluation, and implementation of prediction models. Although machine learning (ML) methods are often said to be capable of circumventing missing data, it is unclear how these methods are used in medical research. We aim to find out if and how well prediction model studies using machine learning report on their handling of missing data. Study design and setting: We systematically searched the literature on published papers between 2018 and 2019 about primary studies developing and/or validating clinical prediction models using any supervised ML methodology across medical fields. From the retrieved studies information about the amount and nature (e.g. missing completely at random, potential reasons for missingness) of missing data and the way they were handled were extracted. Results: We identified 152 machine learning-based clinical prediction model studies. A substantial amount of these 152 papers did not report anything on missing data (n = 56/152). A majority (n = 96/152) reported details on the handling of missing data (e.g., methods used), though many of these (n = 46/96) did not report the amount of the missingness in the data. In these 96 papers the authors only sometimes reported possible reasons for missingness (n = 7/96) and information about missing data mechanisms (n = 8/96). The most common approach for handling missing data was deletion (n = 65/96), mostly via complete-case analysis (CCA) (n = 43/96). Very few studies used multiple imputation (n = 8/96) or built-in mechanisms such as surrogate splits (n = 7/96) that directly address missing data during the development, validation, or implementation of the prediction model.
Conclusion: Though missing values are highly common in any type of medical research and certainly in the research based on routine healthcare data, a majority of the prediction model studies using machine learning does not report sufficient information on the presence and handling of missing data. Strategies in which patient data are simply omitted are unfortunately the most often used methods, even though it is generally advised against and well known that it likely causes bias and loss of analytical power in prediction model development and in the predictive accuracy estimates. Prediction model researchers should be much more aware of alternative methodologies to address missing data.},
  note      = {Funding Information: The UCC is primarily financed by the UMC Utrecht. A grant from the Netherlands Organization for Health Research and Development (\#8480-34001) was obtained to develop feedback procedures. UCC website: www.umuctrecht.nl/ucc (in Dutch). Contact information UCC:
[email protected]. SWJN is supported by a Public-Private Study grant of the Netherlands Heart foundation for the CVRM-IMPROVE project (\#2018B006). This Research Project is financed by the PPP Allowance made available by Top Sector Life Sciences \& Health to Netherlands Heart Foundation to stimulate public-private partnerships. TPAD is supported by the Netherlands Organisation for Health Research and Development (\#91617050). The data that support the findings of this study are available from upon reasonable request. This study was conducted on behalf of the Utrecht Cardiovascular Cohort- CardioVascular Risk Management (UCC- CVRM) study group. Members of the UCC- CVRM Study group: F.W. Asselbergs, Department of Cardiology; G.J. de Borst, Department of Vascular Surgery; M.L. Bots (chair), Julius Center for Health Sciences and Primary Care; S. Dieleman, Division of Vital Functions (anesthesiology and intensive care); M.H. Emmelot, Department of Geriatrics; P.A. de Jong, Department of Radiology; A.T. Lely, Department of Obstetrics/Gynecology; I.E. Hoefer, Laboratory of Clinical Chemistry and Hematology; N.P. van der Kaaij, Department of Cardiothoracic Surgery; Y.M. Ruigrok, Department of Neurology; M.C. Verhaar, Department of Nephrology \& Hypertension; F.L.J. Visseren, Department of Vascular Medicine, University Medical Center Utrecht and Utrecht University. We are grateful to the authors of the original review for the search conducted. Conflict of Interest: All authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper. Publisher Copyright: {\textcopyright} 2021 The Authors},
}