@inproceedings{falis-etal-2022-horses,
title = "Horses to Zebras: Ontology-Guided Data Augmentation and Synthesis for {ICD}-9 Coding",
author = "Falis, Mat{\'u}{\v{s}} and
Dong, Hang and
Birch, Alexandra and
Alex, Beatrice",
editor = "Demner-Fushman, Dina and
Cohen, Kevin Bretonnel and
Ananiadou, Sophia and
Tsujii, Junichi",
booktitle = "Proceedings of the 21st Workshop on Biomedical Language Processing",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.bionlp-1.39",
doi = "10.18653/v1/2022.bionlp-1.39",
pages = "389--401",
abstract = "Medical document coding is the process of assigning labels from a structured label space (ontology {--} e.g., ICD-9) to medical documents. This process is laborious, costly, and error-prone. In recent years, efforts have been made to automate this process with neural models. The label spaces are large (in the order of thousands of labels) and follow a big-head long-tail label distribution, giving rise to few-shot and zero-shot scenarios. Previous efforts tried to address these scenarios within the model, leading to improvements on rare labels, but worse results on frequent ones. We propose data augmentation and synthesis techniques in order to address these scenarios. We further introduce an analysis technique for this setting inspired by confusion matrices. This analysis technique points to the positive impact of data augmentation and synthesis, but also highlights more general issues of confusion within families of codes, and underprediction.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="falis-etal-2022-horses">
<titleInfo>
<title>Horses to Zebras: Ontology-Guided Data Augmentation and Synthesis for ICD-9 Coding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matúš</namePart>
<namePart type="family">Falis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hang</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beatrice</namePart>
<namePart type="family">Alex</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Workshop on Biomedical Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="given">Bretonnel</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Medical document coding is the process of assigning labels from a structured label space (ontology – e.g., ICD-9) to medical documents. This process is laborious, costly, and error-prone. In recent years, efforts have been made to automate this process with neural models. The label spaces are large (in the order of thousands of labels) and follow a big-head long-tail label distribution, giving rise to few-shot and zero-shot scenarios. Previous efforts tried to address these scenarios within the model, leading to improvements on rare labels, but worse results on frequent ones. We propose data augmentation and synthesis techniques in order to address these scenarios. We further introduce an analysis technique for this setting inspired by confusion matrices. This analysis technique points to the positive impact of data augmentation and synthesis, but also highlights more general issues of confusion within families of codes, and underprediction.</abstract>
<identifier type="citekey">falis-etal-2022-horses</identifier>
<identifier type="doi">10.18653/v1/2022.bionlp-1.39</identifier>
<location>
<url>https://aclanthology.org/2022.bionlp-1.39</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>389</start>
<end>401</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Horses to Zebras: Ontology-Guided Data Augmentation and Synthesis for ICD-9 Coding
%A Falis, Matúš
%A Dong, Hang
%A Birch, Alexandra
%A Alex, Beatrice
%Y Demner-Fushman, Dina
%Y Cohen, Kevin Bretonnel
%Y Ananiadou, Sophia
%Y Tsujii, Junichi
%S Proceedings of the 21st Workshop on Biomedical Language Processing
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F falis-etal-2022-horses
%X Medical document coding is the process of assigning labels from a structured label space (ontology – e.g., ICD-9) to medical documents. This process is laborious, costly, and error-prone. In recent years, efforts have been made to automate this process with neural models. The label spaces are large (in the order of thousands of labels) and follow a big-head long-tail label distribution, giving rise to few-shot and zero-shot scenarios. Previous efforts tried to address these scenarios within the model, leading to improvements on rare labels, but worse results on frequent ones. We propose data augmentation and synthesis techniques in order to address these scenarios. We further introduce an analysis technique for this setting inspired by confusion matrices. This analysis technique points to the positive impact of data augmentation and synthesis, but also highlights more general issues of confusion within families of codes, and underprediction.
%R 10.18653/v1/2022.bionlp-1.39
%U https://aclanthology.org/2022.bionlp-1.39
%U https://doi.org/10.18653/v1/2022.bionlp-1.39
%P 389-401
Markdown (Informal)
[Horses to Zebras: Ontology-Guided Data Augmentation and Synthesis for ICD-9 Coding](https://aclanthology.org/2022.bionlp-1.39) (Falis et al., BioNLP 2022)
ACL