
<!DOCTYPE article
  PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD with MathML3 v1.3 20210610//EN" "JATS-archivearticle1-3-mathml3.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="1.3" article-type="editorial" xml:lang="en"><processing-meta tagset-family="jats" base-tagset="archiving" mathml-version="3.0" table-model="xhtml"><custom-meta-group><custom-meta assigning-authority="highwire" xlink:type="simple"><meta-name>recast-jats-build</meta-name><meta-value>d8e1462159</meta-value></custom-meta></custom-meta-group></processing-meta><front><journal-meta><journal-id journal-id-type="hwp">jitc</journal-id><journal-id journal-id-type="nlm-ta">J Immunother Cancer</journal-id><journal-id journal-id-type="publisher-id">jitc</journal-id><journal-title-group><journal-title>Journal for ImmunoTherapy of Cancer</journal-title><abbrev-journal-title abbrev-type="publisher">J Immunother Cancer</abbrev-journal-title><abbrev-journal-title>J Immunother Cancer</abbrev-journal-title></journal-title-group><issn pub-type="epub">2051-1426</issn><publisher><publisher-name>BMJ Publishing Group Ltd</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">jitc-2021-003044</article-id><article-id pub-id-type="doi">10.1136/jitc-2021-003044</article-id><article-id pub-id-type="apath" assigning-authority="highwire">/jitc/9/7/e003044.atom</article-id><article-categories><subj-group subj-group-type="heading"><subject>Commentary</subject></subj-group><subj-group subj-group-type="collection" assigning-authority="publisher"><subject>Open access</subject></subj-group><subj-group subj-group-type="collection" assigning-authority="publisher"><subject>Commentary/Editorials</subject></subj-group><subj-group subj-group-type="collection" assigning-authority="highwire"><subject>Special collections</subject><subj-group><subject>JITC</subject><subj-group><subject>Commentary/Editorials</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="collection" assigning-authority="highwire"><subject>Special collections</subject><subj-group><subject>Open 
access</subject></subj-group></subj-group></article-categories><title-group><article-title>Letter to the editor: radiomics analysis for predicting pembrolizumab response in patients with advanced rare cancers</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" id="author-88284114" xlink:type="simple"><contrib-id contrib-id-type="orcid" authenticated="false">http://orcid.org/0000-0001-9101-8553</contrib-id><name name-style="western"><surname>Cunha</surname><given-names>Mateus Trinconi</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" id="author-88373756" xlink:type="simple"><name name-style="western"><surname>Carvalho</surname><given-names>Vinicius Jardim</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" id="author-88373780" xlink:type="simple"><contrib-id contrib-id-type="orcid" authenticated="false">http://orcid.org/0000-0002-1635-2225</contrib-id><name name-style="western"><surname>Loureiro</surname><given-names>Rafael Maffei</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" id="author-88373805" xlink:type="simple"><name name-style="western"><surname>Brantis-de-Carvalho</surname><given-names>Carlos Eduardo</given-names></name><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" id="author-88373733" xlink:type="simple"><name name-style="western"><surname>Cintra</surname><given-names>Murilo Bicudo</given-names></name><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" id="author-79392269" xlink:type="simple"><name name-style="western"><surname>de Castro Junior</surname><given-names>Gilberto</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><label>1</label><institution content-type="department" xlink:type="simple">Medical 
Oncology</institution>, <institution xlink:type="simple">Instituto do Cancer do Estado de Sao Paulo, Faculdade de Medicina da Universidade de Sao Paulo</institution>, <addr-line content-type="city">Sao Paulo</addr-line>, <country>Brazil</country></aff><aff id="aff2"><label>2</label><institution content-type="department" xlink:type="simple">Computer Science</institution>, <institution xlink:type="simple">Instituto de Matematica e Estatistica, Universidade de Sao Paulo</institution>, <addr-line content-type="city">Sao Paulo</addr-line>, <country>Brazil</country></aff><aff id="aff3"><label>3</label><institution content-type="department" xlink:type="simple">Department of Radiology</institution>, <institution xlink:type="simple">Hospital Israelita Albert Einstein</institution>, <addr-line content-type="city">Sao Paulo</addr-line>, <country>Brazil</country></aff><aff id="aff4"><label>4</label><institution content-type="department" xlink:type="simple">Faculdade de Medicina</institution>, <institution xlink:type="simple">Centro Universitario Saude ABC</institution>, <addr-line content-type="city">Santo Andre</addr-line>, <addr-line content-type="state">SP</addr-line>, <country>Brazil</country></aff><aff id="aff5"><label>5</label><institution content-type="department" xlink:type="simple">Radiology</institution>, <institution xlink:type="simple">Diagnosticos da America</institution>, <addr-line content-type="city">Barueri</addr-line>, <country>Brazil</country></aff><aff id="aff6"><label>6</label><institution content-type="department" xlink:type="simple">Radiology</institution>, <institution xlink:type="simple">Instituto do Cancer do Estado de Sao Paulo, Faculdade de Medicina da Universidade de Sao Paulo</institution>, <addr-line content-type="city">Sao Paulo</addr-line>, <country>Brazil</country></aff><author-notes><corresp><label>Correspondence to</label> Dr Mateus Trinconi Cunha; <email xlink:type="simple">mateustcunha@gmail.com</email></corresp></author-notes><pub-date 
date-type="pub" iso-8601-date="2021-07" pub-type="ppub" publication-format="print"><month>7</month><year>2021</year></pub-date><pub-date date-type="pub" iso-8601-date="2021-07-23" pub-type="epub-original" publication-format="electronic"><day>23</day><month>7</month><year>2021</year></pub-date><pub-date iso-8601-date="2021-07-01T06:09:11-07:00" pub-type="hwp-received"><day>1</day><month>7</month><year>2021</year></pub-date><pub-date iso-8601-date="2021-07-01T06:09:11-07:00" pub-type="hwp-created"><day>1</day><month>7</month><year>2021</year></pub-date><volume>9</volume><issue>7</issue><elocation-id>e003044</elocation-id><history><date date-type="accepted" iso-8601-date="2021-06-14"><day>14</day><month>06</month><year>2021</year></date></history><permissions><copyright-statement>© Author(s) (or their employer(s)) 2021. Re-use permitted under CC BY-NC. No commercial re-use. See rights and permissions. Published by BMJ.</copyright-statement><copyright-year>2021</copyright-year><license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by-nc/4.0/" xlink:type="simple"><ali:license_ref xmlns:ali="http://www.niso.org/schemas/ali/1.0/" start_date="2021-07-23">http://creativecommons.org/licenses/by-nc/4.0/</ali:license_ref><license-p>This is an open access article distributed in accordance with the Creative Commons Attribution Non Commercial (CC BY-NC 4.0) license, which permits others to distribute, remix, adapt, build upon this work non-commercially, and license their derivative works on different terms, provided the original work is properly cited, appropriate credit is given, any changes made indicated, and the use is non-commercial. 
See <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by-nc/4.0/" xlink:type="simple">http://creativecommons.org/licenses/by-nc/4.0/</ext-link>.</license-p></license></permissions><self-uri content-type="pdf" xlink:href="jitc-2021-003044.pdf" xlink:type="simple"/><related-article ext-link-type="doi" related-article-type="companion" xlink:href="10.1136/jitc-2021-003299" id="RA1" xlink:type="simple"/><abstract><p>A commentary on the original research article: ‘Radiomics analysis for predicting pembrolizumab response in patients with advanced rare cancers’. Of note, the predictor selection process, the cross-validation method, along with the lack of final testing of the developed model with a separated data set may mask overfitting, overestimating performance metrics.</p></abstract><kwd-group><kwd>immunotherapy</kwd></kwd-group><custom-meta-group><custom-meta xlink:type="simple"><meta-name>special-feature</meta-name><meta-value>unlocked</meta-value></custom-meta></custom-meta-group></article-meta></front><body><p>In the original research article, Colen <italic toggle="yes">et al</italic><xref ref-type="bibr" rid="R1">1</xref> use classic statistics and machine learning methods in order to identify significant radiomics features and predict pembrolizumab response in advanced rare cancers. This novel approach raises relevant hypotheses and may eventually prove useful in the expansion of the therapeutic arsenal for some patients.</p><p>However, the encouraging results obtained are, until further clarification, to be interpreted with caution. 
Machine learning is the software-mediated attempt to produce accurate output from previously unseen data through mostly automatic adjustment of parameters based on previous experience.<xref ref-type="bibr" rid="R2">2</xref> Effectively, the ‘learning’ step in this study occurs in a supervised fashion, that is, feeding the algorithm examples of labeled data (ie, the characteristics of each patient along with the label of ‘responder’ or ‘non-responder’). The learning algorithm then builds models to predict each patient’s label as accurately as possible.<xref ref-type="bibr" rid="R3">3</xref></p><p>After initial training, model validation is carried out. This is usually done by splitting the data set into training and validation sets: two groups with no overlapping patients, each used exclusively in their respective phase. To increase the model’s generalization capability and decrease any sample selection bias, resampling methods are used. Bootstrapping is the process of resampling data with replacement, usually producing several new groups of different training and test data sets, sometimes containing multiple instances of the same original cases, while omitting others. Cross-validation comprises resampling without replacement, systematically producing k surrogate data sets, with each of the n original cases being part of the validation data set exactly once. This is called k-fold cross-validation. A special case is leave-one-out cross-validation (LOOCV), in which the training set consists of all cases but one, and the remaining case is used as a one-case validation set. The process repeats until all cases are separately used as validation. 
LOOCV is usually reserved for small data sets, in which the omission of a significant part of the training data (ie, 10%–20%) might hinder algorithm learning and thus performance.<xref ref-type="bibr" rid="R4">4</xref></p><p>Following the validation phase, the investigators may adjust the algorithm’s hyperparameters and try again until satisfactory performance is achieved. Since many changes are made to make the model more accurate for the validation data, overfitting may occur. This usually causes high performance metrics in the validation set, with poor prediction capability in a distinct data set. To detect such phenomena, testing on sequestered, previously unseen data is performed, differences in model metrics are analyzed, methodology problems are addressed, and the process is repeated.<xref ref-type="bibr" rid="R4">4</xref></p><p>In the study, Colen <italic toggle="yes">et al</italic><xref ref-type="bibr" rid="R1">1</xref> address the objective with an admittedly small, but multidimensional patient data set, using LOOCV to assess model accuracy and C-statistic. However, caveats to their study design should be noted. Regarding feature selection, in both tables 3 and 4, multiple instances of the same feature in different levels of grayscale can be seen. While their relevance was reportedly identified by a sound method (L1 penalty), one cannot but wonder about their collinearity (assessed by variance inflation factor<xref ref-type="bibr" rid="R5">5</xref>), and whether data preprocessing or the usage of other selection methods (wrapper or embedded methods) would change the outcomes. This must be carefully considered when small-n-large-p-problems, known to lead to feature selection instability, are involved.<xref ref-type="bibr" rid="R6 R7">6 7</xref> In relation to cross-validation, while LOOCV maximizes training data, testing a single point at a time implies a large variance in error and a similarly high variance of CIs. 
The method underestimates error rates, especially in small samples with high dimensionality (ie, few patients with several features), which can explain the reported results.<xref ref-type="bibr" rid="R8 R9">8 9</xref> The lack of cross-validation on blocks of correlated data may introduce another bias in the study: the algorithm might have been able to distinguish between different primary sites and, by correlating tumor origin to outcome, always guess the correct label, leading to accuracy and C-statistic inflation. For example, penile carcinomas, small cell malignancies of non-pulmonary origin, and retroperitoneal spindle cell sarcoma had no responders in the sample, always yielding perfect predictions of no response in the test, while this might not hold true in external validation.<xref ref-type="bibr" rid="R9">9</xref></p><p>Additionally, the lack of testing in a separated set after cross-validation hinders the credibility of the outstanding metrics achieved—at least until independent verification.<xref ref-type="bibr" rid="R8">8</xref></p><p>In order to address the outlined issues, the following procedures might be applied: assessment of feature collinearity and usage of different methods of feature selection might help with the small-n-large-p-problem; and the separation of lesions (in case of metastatic sites) into distinct data points, as well as data amplification methods (such as synthetic minority over-sampling technique<xref ref-type="bibr" rid="R10">10</xref>) may help increase the data set. After a larger amount of data is achieved, other resampling strategies (k-fold cross-validation or bootstrapping) may be employed, and more data (with no synthetic points) can be spared for final testing and overfitting assessment. The analysis of one lesion may not be a surrogate marker for cancer response to immunotherapy, but it may be an interesting hypothesis generator. 
Also, other predictive models for treatment response based on voting on the probability of response for each tumor in a patient may be developed from the original algorithms.</p></body><back><fn-group><fn fn-type="other"><label>Twitter</label><p>@MateusTrinconi, @RMaffeiLoureiro, @GilbertodeCas13</p></fn><fn fn-type="other"><label>Contributors</label><p>All authors contributed equally to letter conception, design, data analysis and interpretation, manuscript writing, and final approval.</p></fn><fn fn-type="other"><label>Funding</label><p>The authors have not declared a specific grant for this research from any funding agency in the public, commercial or not-for-profit sectors.</p></fn><fn fn-type="conflict"><label>Competing interests</label><p>GdC has received personal fees from AstraZeneca, Bayer, Bristol-Myers Squibb, Boehringer Ingelheim, Janssen, Lilly, Merck Serono, Merck Sharp and Dohme, Novartis, Pfizer, Roche, Teva, and Yuhan, none related to this publication.</p></fn><fn fn-type="other"><label>Provenance and peer review</label><p>Commissioned; internally peer reviewed.</p></fn></fn-group><sec sec-type="ethics-statement"><title>Ethics statements</title><sec sec-type="ethics-consent-to-publish"><title>Patient consent for publication</title><p>Not required.</p></sec></sec><ref-list><title>References</title><ref id="R1"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Colen</surname> <given-names>RR</given-names></string-name>, <string-name name-style="western"><surname>Rolfo</surname> <given-names>C</given-names></string-name>, <string-name name-style="western"><surname>Ak</surname> <given-names>M</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Radiomics analysis for predicting pembrolizumab response in patients with advanced rare cancers</article-title>. 
<source>J Immunother Cancer</source> <year>2021</year>;<volume>9</volume>:<elocation-id>e001752</elocation-id>. <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1136/jitc-2020-001752" xlink:type="simple">doi:10.1136/jitc-2020-001752</ext-link><pub-id pub-id-type="pmid" xlink:type="simple">http://www.ncbi.nlm.nih.gov/pubmed/33849924</pub-id></mixed-citation></ref><ref id="R2"><label>2</label><mixed-citation publication-type="book" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Koza</surname> <given-names>JR</given-names></string-name>, <string-name name-style="western"><surname>Bennett</surname> <given-names>FH</given-names></string-name>, <string-name name-style="western"><surname>Andre</surname> <given-names>D</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>Automated Design of Both the Topology and Sizing of Analog Electrical Circuits Using Genetic Programming</chapter-title>. <comment>In</comment>: <person-group person-group-type="editor"><string-name name-style="western"><surname>Gero</surname> <given-names>JS</given-names></string-name>, <string-name name-style="western"><surname>Sudweeks</surname> <given-names>F</given-names></string-name></person-group>, <comment>eds</comment>. <source>Artificial Intelligence in Design ’96</source>. <publisher-loc>Dordrecht</publisher-loc>: <publisher-name>Springer Netherlands</publisher-name>, <year>1996</year>: <fpage>151</fpage>–<lpage>70</lpage>.</mixed-citation></ref><ref id="R3"><label>3</label><mixed-citation publication-type="confproc" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Chen</surname> <given-names>T</given-names></string-name>, <string-name name-style="western"><surname>Guestrin</surname> <given-names>C</given-names></string-name>, <collab xlink:type="simple">Association for Computing Machinery</collab></person-group>. 
<article-title>XGBoost: a scalable tree boosting system</article-title>. <conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>, <conf-loc>New York, NY, USA</conf-loc>, <year>2016</year>:<fpage>785</fpage>–<lpage>94</lpage>.</mixed-citation></ref><ref id="R4"><label>4</label><mixed-citation publication-type="book" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Burkov</surname> <given-names>A</given-names></string-name></person-group>. <chapter-title>Basic Practice</chapter-title>. <comment>In</comment>: <source>The Hundred-Page machine learning book</source>, <year>2019</year>. <uri xlink:href="http://themlbook.com/wiki/doku.php" xlink:type="simple">http://themlbook.com/wiki/doku.php</uri></mixed-citation></ref><ref id="R5"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Grønning</surname> <given-names>B</given-names></string-name>, <string-name name-style="western"><surname>Nilsson</surname> <given-names>JC</given-names></string-name></person-group>. <article-title>Multiple regression: a primer</article-title>. <source>Stat Med</source> <year>2001</year>;<volume>20</volume>:<fpage>1888</fpage>–<lpage>9</lpage>.</mixed-citation></ref><ref id="R6"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Jain</surname> <given-names>A</given-names></string-name>, <string-name name-style="western"><surname>Zongker</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Feature selection: evaluation, application, and small sample performance</article-title>. 
<source>IEEE Trans Pattern Anal Mach Intell</source> <year>1997</year>;<volume>19</volume>:<fpage>153</fpage>–<lpage>8</lpage>.<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1109/34.574797" xlink:type="simple">doi:10.1109/34.574797</ext-link></mixed-citation></ref><ref id="R7"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>He</surname> <given-names>Z</given-names></string-name>, <string-name name-style="western"><surname>Yu</surname> <given-names>W</given-names></string-name></person-group>. <article-title>Stable feature selection for biomarker discovery</article-title>. <source>Comput Biol Chem</source> <year>2010</year>;<volume>34</volume>:<fpage>215</fpage>–<lpage>25</lpage>.<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/j.compbiolchem.2010.07.002" xlink:type="simple">doi:10.1016/j.compbiolchem.2010.07.002</ext-link><pub-id pub-id-type="pmid" xlink:type="simple">http://www.ncbi.nlm.nih.gov/pubmed/20702140</pub-id></mixed-citation></ref><ref id="R8"><label>8</label><mixed-citation publication-type="confproc" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Rao</surname> <given-names>RB</given-names></string-name>, <string-name name-style="western"><surname>Fung</surname> <given-names>G</given-names></string-name>, <string-name name-style="western"><surname>Rosales</surname> <given-names>R</given-names></string-name>, <collab xlink:type="simple">Society for Industrial and Applied Mathematics</collab></person-group>. <article-title>On the dangers of cross-validation. An experimental evaluation</article-title>. 
<conf-name>Proceedings of the 2008 SIAM International Conference on Data Mining</conf-name>, <conf-loc>Philadelphia, PA</conf-loc>, <year>2008</year>:<fpage>588</fpage>–<lpage>96</lpage>.</mixed-citation></ref><ref id="R9"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Varoquaux</surname> <given-names>G</given-names></string-name>, <string-name name-style="western"><surname>Raamana</surname> <given-names>PR</given-names></string-name>, <string-name name-style="western"><surname>Engemann</surname> <given-names>DA</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Assessing and tuning brain decoders: cross-validation, caveats, and guidelines</article-title>. <source>Neuroimage</source> <year>2017</year>;<volume>145</volume>:<fpage>166</fpage>–<lpage>79</lpage>.<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/j.neuroimage.2016.10.038" xlink:type="simple">doi:10.1016/j.neuroimage.2016.10.038</ext-link><pub-id pub-id-type="pmid" xlink:type="simple">http://www.ncbi.nlm.nih.gov/pubmed/27989847</pub-id></mixed-citation></ref><ref id="R10"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><person-group person-group-type="author"><string-name name-style="western"><surname>Chawla</surname> <given-names>NV</given-names></string-name>, <string-name name-style="western"><surname>Bowyer</surname> <given-names>KW</given-names></string-name>, <string-name name-style="western"><surname>Hall</surname> <given-names>LO</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>SMOTE: synthetic minority Over-sampling technique</article-title>. 
<source>Jair</source> <year>2002</year>;<volume>16</volume>:<fpage>321</fpage>–<lpage>57</lpage>.<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1613/jair.953" xlink:type="simple">doi:10.1613/jair.953</ext-link></mixed-citation></ref></ref-list></back></article>