diff --git a/.gitignore b/.gitignore index 15f1423..2c32756 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ dmypy.json # Results/Logs results/* +results*/* logs/*.log # toml files diff --git a/anemic/__init__.py b/anemic/__init__.py new file mode 100644 index 0000000..cac0ba3 --- /dev/null +++ b/anemic/__init__.py @@ -0,0 +1,7 @@ +from anemic import ( + datasets, + models, + modules, + trainers, + utils, +) diff --git a/anemic/datasets/__init__.py b/anemic/datasets/__init__.py new file mode 100644 index 0000000..047b697 --- /dev/null +++ b/anemic/datasets/__init__.py @@ -0,0 +1 @@ +from anemic.datasets import base_dataset diff --git a/src/datasets/base_dataset.py b/anemic/datasets/base_dataset.py similarity index 95% rename from src/datasets/base_dataset.py rename to anemic/datasets/base_dataset.py index 912828c..43eda14 100644 --- a/src/datasets/base_dataset.py +++ b/anemic/datasets/base_dataset.py @@ -5,9 +5,9 @@ import torch from torch.utils.data import Dataset -from src.utils.file_loaders import load_csv_as_df, load_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_csv_as_df, load_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/anemic/models/__init__.py b/anemic/models/__init__.py new file mode 100644 index 0000000..2457c48 --- /dev/null +++ b/anemic/models/__init__.py @@ -0,0 +1,8 @@ +from anemic.models import ( + auto_models, + caml, + dcan, + fusion, + multirescnn, + transicd, +) diff --git a/anemic/models/auto_models.py b/anemic/models/auto_models.py new file mode 100644 index 0000000..ec38066 --- /dev/null +++ b/anemic/models/auto_models.py @@ -0,0 +1 @@ +from anemic.utils.mapper import ConfigMapper diff --git a/src/models/caml.py b/anemic/models/caml.py similarity index 98% rename from src/models/caml.py rename to anemic/models/caml.py index 8828292..73c79c6 100644 --- a/src/models/caml.py +++ b/anemic/models/caml.py @@ -12,9 +12,9 @@ from torch.autograd import Variable from torch.nn.init import xavier_uniform -from src.utils.model_utils import load_lookups, pad_desc_vecs -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.model_utils import load_lookups, pad_desc_vecs +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/anemic/models/dcan.py b/anemic/models/dcan.py new file mode 100644 index 0000000..f46c6ba --- /dev/null +++ b/anemic/models/dcan.py @@ -0,0 +1,507 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import xavier_uniform_ +from torch.nn.utils import weight_norm as weight_norm_ + +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger + +logger = get_logger(__name__) + + +@ConfigMapper.map("models", "dcan") +class DCAN(nn.Module): + """ + This class is used to create the DCAN model. + References: + Paper: https://aclanthology.org/2020.clinicalnlp-1.8/ + GitHub Repository: https://github.com/shaoxiongji/DCAN + For the parameters related to convolutional layers, please see this: + https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html. + Args: + num_classes (int): Number of classes (ICD codes). + conv_channel_sizes (list): List of lists of integers. 
Each list + represents the channel sizes of the convolutional + layers in a `TemporalBlock`. So, for example, + if the list is [[100, 600, 600], + [600, 600, 600]], the `TemporalConvNet` layer will have 2 + `TemporalBlock`s, and each temporal block will have + 2 convolutional layers: + Conv(100, 600), Conv(600, 600) for the first + one, and Conv(600, 600), Conv(600, 600) for the + second one. If `add_emb_size_to_channel_sizes` is + True, we don't have to pass the input channel size + of the first block. So, in the above case, we can + just pass [[600, 600], [600, 600, 600]].
+ add_emb_size_to_channel_sizes (bool): If True, you need not specify + the input channel size. Please + see the description of + `conv_channel_sizes`.
+ kernel_sizes (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + kernel size/filter size of the respective + convolutional layer in the `TemporalBlock` layer.
+ strides (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + stride of the respective convolutional layer in the + `TemporalBlock` layer.
+ paddings (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + padding of the respective convolutional layer in the + `TemporalBlock` layer. In DCAN, this value is set to + "(kernel_size - 1) * dilation_size".
+ dilations (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + dilation size of the respective convolutional layer in the + `TemporalBlock` layer. In DCAN, this value is + "2^(temporal_block_level)".
+ dropout (float): Dropout probability applied after every convolutional + layer in every `TemporalBlock`.
+ weight_norm (bool): If True, apply weight normalization to the + convolutional layers.
+ activation (str): Activation function to use. Should be one of "relu", + "elu", "leaky_relu".
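As a quick illustration of how these nested-list arguments fit together, here is a minimal sketch (the sizes are made up and not taken from the patch or its config files; only the relationships follow the docstring above):

# Hypothetical DCAN-style hyper-parameters (illustrative values only).
emb_size = 100                                    # word-embedding dimension
conv_channel_sizes = [[600, 600], [600, 600]]     # two TemporalBlocks, two conv layers each
kernel_sizes = [[2, 2], [2, 2]]
strides = [[1, 1], [1, 1]]
dilations = [[1, 1], [2, 2]]                      # 2^(temporal_block_level)
paddings = [[(k - 1) * d for k, d in zip(ks, ds)]
            for ks, ds in zip(kernel_sizes, dilations)]   # (kernel_size - 1) * dilation_size

# With add_emb_size_to_channel_sizes=True, the model prepends the embedding size itself,
# so the first block effectively becomes [100, 600, 600]:
conv_channel_sizes[0] = [emb_size] + conv_channel_sizes[0]
# First block: Conv(100, 600), Conv(600, 600); second block: Conv(600, 600), Conv(600, 600).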
+ """ + + def __init__(self, config): + super(DCAN, self).__init__() + logger.info(f"Initialising {self.__class__.__name__}") + logger.debug( + f"Initialising {self.__class__.__name__} with " f"config: {config}" + ) + + self.config = config + + self.word_embedding_layer = WordEmbeddingLayer( + **config.word_representation_layer.params.init_params.as_dict() + ) + if config.word_representation_layer.params.freeze_layer: + self.freeze_layer(self.word_embedding_layer.embed) + + num_levels = len(config.kernel_sizes) + num_inner_conv_levels = len(config.kernel_sizes[0]) + + conv_channel_sizes = copy.deepcopy(config.conv_channel_sizes) + if config.add_emb_size_to_channel_sizes: + conv_channel_sizes[0] = [ + self.word_embedding_layer.embedding_size + ] + conv_channel_sizes[0] + dropouts = [ + [config.dropout for _ in range(num_inner_conv_levels)] + for _ in range(num_levels) + ] + + self.temporal_conv_net = TemporalConvNet( + conv_channel_sizes_=conv_channel_sizes, + kernel_sizes_=config.kernel_sizes, + strides_=config.strides, + paddings_=config.paddings, + dilations_=config.dilations, + dropouts_=dropouts, + weight_norm=config.weight_norm, + activation=config.activation, + ) + + self.linear_layer = nn.Linear( + conv_channel_sizes[-1][-1], config.projection_size + ) + self.activation = ConfigMapper.get_object( + "activations", config.activation + )() + + self.output_layer = OutputLayer( + config.projection_size, config.num_classes + ) + + xavier_uniform_(self.linear_layer.weight) + + def forward(self, data): + x = self.word_embedding_layer(data) + hid_seq = self.temporal_conv_net(x.transpose(1, 2)).transpose(1, 2) + hid_seq = self.activation(self.linear_layer(hid_seq)) + logits = self.output_layer(hid_seq) + return logits + + def freeze_layer(self, layer): + for param in layer.parameters(): + param.requires_grad = False + + def get_input_attention(self): + # Use the attention score computed in the forward pass + return self.output_layer.label_wise_attn.alpha.cpu().detach().numpy() + + +class OutputLayer(nn.Module): + def __init__(self, input_size, num_classes): + super(OutputLayer, self).__init__() + self.label_wise_attn = LabelWiseAttn(input_size, num_classes) + + self.final = nn.Linear(input_size, num_classes) + xavier_uniform_(self.final.weight) + + def forward(self, x): + m = self.label_wise_attn(x) + logits = self.final.weight.mul(m).sum(dim=2).add(self.final.bias) + return logits + + +class WordEmbeddingLayer(nn.Module): + """ + A Word Embedding Layer. This layer loads a pre-trained word embedding matrix + , and copies its weights to an nn.Embedding layer. + + Args: + embed_dir (str): A directory containing the pre-trained word embedding + matrix, among other things. Please see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/anemic/modules/embeddings.py#L17 + for more details. + dropout (float): The dropout probability. + """ + + def __init__(self, embed_dir, dropout): + super(WordEmbeddingLayer, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"embed_dir = {embed_dir}, dropout = {dropout}" + ) + + # Note: This should be changed, since we won't always use Word2Vec. 
+ embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + + W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) + self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) + self.embed.weight.data = W.clone() + + self.embedding_size = self.embed.embedding_dim + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + embedding = self.embed(x) + x = self.dropout(embedding) + return x + + +class Chomp1d(nn.Module): + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + return x[:, :, : -self.chomp_size].contiguous() + + +class ConvTemporalSubBlock(nn.Module): + """ + A simple temporal convolutional block. Adapted from + https://github.com/shaoxiongji/DCAN/blob/master/models.py#L84-L88. This + layer has a dilated convolutional layer, a `chomp1d` layer, followed by + activation and dropout. For the parameters related to convolutional layers, + please see this: + https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html. + + Args: + in_channels (int): The number of input channels in the convolutional + layer. + out_channels (int): The number of output channels in the convolutional + layer. + kernel_size (int): The size of the kernel in the convolutional layer. + stride (int): The stride of the convolutional layer. + padding (int): The padding of the convolutional layer. + dilation (int): The dilation size of the convolutional layer. + dropout (float): The dropout probability. + weight_norm (bool): Whether to apply weight normalization to the + convolutional layer. + activation (str): The activation function to use. DCAN uses "relu". + For all available activations, see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/anemic/modules/activations.py. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + dropout=0.2, + weight_norm=True, + activation="relu", + ): + super(ConvTemporalSubBlock, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"in_channels = {in_channels}, out_channels = " + f"{out_channels}, kernel_size = {kernel_size}, " + f"stride = {stride}, padding = {padding}, " + f"dilation = {dilation}, dropout = {dropout}, " + f"weight_norm = {weight_norm}, activation = {activation}" + ) + + self.conv_layer = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + ) + if weight_norm: + self.conv_layer = weight_norm_(self.conv_layer) + self.chomp1d = Chomp1d(padding) + self.activation = ConfigMapper.get_object("activations", activation)() + self.dropout = nn.Dropout(dropout) + + self.__init_weights__() + + def __init_weights__(self): + xavier_uniform_(self.conv_layer.weight) + + def forward(self, x): + x = self.conv_layer(x) + x = self.chomp1d(x) + x = self.activation(x) + x = self.dropout(x) + return x + + +class TemporalBlock(nn.Module): + """ + A Temporal Block containing stacks of `ConvTemporalSubBlocks`, followed + by activation. + References: + Paper: https://arxiv.org/abs/2009.14578 + Repository: https://github.com/shaoxiongji/DCAN/blob/master/models.py#L81 + + Args: + conv_channel_sizes (list): List of integers, with channel sizes of + convolutional layers. For example, if the + list is [100, 200, 300], there will be two + convolutional layers: Conv1d(100, 200) and + Conv1d(200, 300). 
+ kernel_sizes (list): List of integers, with kernel sizes of every + `ConvTemporalSubBlock`. + strides (list): List of integers, with strides of convolutional layers. + paddings (list): List of integers, with paddings of every + `ConvTemporalSubBlock`. + dilations (list): List of integers, with dilation sizes of every + `ConvTemporalSubBlock`. + dropouts (list): List of floats, with dropout probabilities of every + `ConvTemporalSubBlock`. + weight_norm (bool): Whether to apply weight normalization to every + convolutional layer. DCAN uses weight norm. + activation (str): The activation function to use. DCAN uses "relu". + """ + + def __init__( + self, + conv_channel_sizes, + kernel_sizes, + strides, + paddings, + dilations, + dropouts, + weight_norm=True, + activation="relu", + ): + super(TemporalBlock, self).__init__() + conv_channel_size_pairs = list( + zip(conv_channel_sizes[:-1], conv_channel_sizes[1:]) + ) + + self.conv_temporal_sub_blocks = nn.ModuleList( + [ + ConvTemporalSubBlock( + in_channels=conv_channel_size_pair[0], + out_channels=conv_channel_size_pair[1], + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + dropout=dropout, + weight_norm=weight_norm, + activation=activation, + ) + for ( + conv_channel_size_pair, + kernel_size, + stride, + padding, + dilation, + dropout, + ) in zip( + conv_channel_size_pairs, + kernel_sizes, + strides, + paddings, + dilations, + dropouts, + ) + ] + ) + + self.downsample = ( + nn.Conv1d(conv_channel_sizes[0], conv_channel_sizes[-1], 1) + if conv_channel_sizes[0] != conv_channel_sizes[-1] + else None + ) + self.output_activation = ConfigMapper.get_object( + "activations", activation + )() + + self.init_weights() + + def init_weights(self): + if self.downsample is not None: + xavier_uniform_(self.downsample.weight) + + def forward(self, x): + conv_layer_output = x + for conv_temporal_sub_block in self.conv_temporal_sub_blocks: + conv_layer_output = conv_temporal_sub_block(conv_layer_output) + res = x if self.downsample is None else self.downsample(x) + return self.output_activation(conv_layer_output + res) + + +class TemporalConvNet(nn.Module): + """ + Stack of `TemporalBlock`s. Used in the DCAN model. + References: + Paper: https://arxiv.org/abs/2009.14578 + Repository: https://github.com/shaoxiongji/DCAN/blob/master/models.py#L114 + + Args: + conv_channel_sizes_ (list): List of lists of integers. Each list + represents the channel sizes of convolutional + layers in a `TemporalBlock`. So, for + example, if the list is [[100, 600, 600], + [600, 600, 600]]. + the `TemporalConvNet` layer will have 2 + `TemporalBlock`s, each temporal block have + 2 convolutional layers: + Conv(100, 600), Conv(600, 600) for the first + one, and Conv(600, 600), Conv(600, 600). If + the `add_emb_size_to_channel_sizes`, we + don't have to pass the input channel size. + So, in the above case, we can just pass + [[600, 600], [600, 600, 600]]. + kernel_sizes_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + kernel size/filter size of the respective + convolutional layer in `TemporalBlock` layer. + strides_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + stride of the respective convolutional layer in + `TemporalBlock` layer. + paddings_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + padding of the respective convolutional layer in + `TemporalBlock` layer. 
in DCAN, this value is set to + "(kernel_size - 1) * dilation_size". + dilations_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + dilation size of the respective convolutional layer + `TemporalBlock` layer.` In DCAN, this value is + "2^(temporal_block_level)". + dropouts_ (list): List of list of floats (same format as + `conv_channel_sizes`). Each float represents the + dropout probability of the respective convolutional + `TemporalBlock` layer. + weight_norm (bool): If True, apply weight normalization to the + convolutional layers. + activation (str): Activation function to use. DCAN uses "relu". + """ + + def __init__( + self, + conv_channel_sizes_, + kernel_sizes_, + strides_, + paddings_, + dilations_, + dropouts_, + weight_norm=True, + activation="relu", + ): + super(TemporalConvNet, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"conv_channel_sizes_ = {conv_channel_sizes_}, " + f"kernel_sizes_ = {kernel_sizes_}, " + f"strides_ = {strides_}, paddings_ = {paddings_}, " + f"dilations_ = {dilations_}, dropouts_ = {dropouts_}, " + f"weight_norm = {weight_norm}, activation = {activation}" + ) + + self.temporal_blocks = nn.ModuleList( + [ + TemporalBlock( + conv_channel_sizes=conv_channel_sizes, + kernel_sizes=kernel_sizes, + strides=strides, + paddings=paddings, + dilations=dilations, + dropouts=dropouts, + weight_norm=weight_norm, + activation=activation, + ) + for ( + conv_channel_sizes, + kernel_sizes, + strides, + paddings, + dilations, + dropouts, + ) in zip( + conv_channel_sizes_, + kernel_sizes_, + strides_, + paddings_, + dilations_, + dropouts_, + ) + ] + ) + + def forward(self, x): + for temporal_block in self.temporal_blocks: + x = temporal_block(x) + return x + + +class LabelWiseAttn(nn.Module): + """ + A Label-wise Attention layer (as implemented in CAML, DCAN, etc.). + References: + Papers: https://arxiv.org/abs/1802.05695 (Section 2.2) + Repository: https://github.com/jamesmullenbach/caml-mimic/blob/master/learn/models.py#L184 + + Args: + input_size (int): The size of the input, i.e., the number of channels + if the output is from a convolutional layer/embedding + size if the output is from a fully connected layer. + num_classes (int): The number of classes. 
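To make the attention shapes concrete, here is a small self-contained sketch of the label-wise attention described here, together with the per-label output computation from `OutputLayer` earlier in this file (toy sizes, illustrative only, not part of the patch):

import torch
import torch.nn as nn
import torch.nn.functional as F

bs, seq_len, dim, num_classes = 2, 5, 4, 3         # toy sizes
x = torch.randn(bs, seq_len, dim)                  # per-token features fed to LabelWiseAttn

U = nn.Linear(dim, num_classes)                    # attention parameters, one row per ICD code
att = U.weight.matmul(x.transpose(1, 2))           # [bs, num_classes, seq_len]
alpha = F.softmax(att, dim=2)                      # attention over positions, one distribution per label
m = alpha.matmul(x)                                # [bs, num_classes, dim] label-specific summaries

final = nn.Linear(dim, num_classes)                # as in OutputLayer.final
logits = final.weight.mul(m).sum(dim=2).add(final.bias)   # [bs, num_classes]
print(logits.shape)                                # torch.Size([2, 3])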
+ """ + + def __init__(self, input_size, num_classes): + super(LabelWiseAttn, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"input size = {input_size}, num_classes = {num_classes}" + ) + + self.U = nn.Linear(input_size, num_classes) + xavier_uniform_(self.U.weight) + + def forward(self, x): + att = self.U.weight.matmul(x.transpose(1, 2)) # [bs, Y, seq_len] + self.alpha = F.softmax(att, dim=2) + m = self.alpha.matmul(x) # [bs, Y, dim] + return m diff --git a/src/models/fusion.py b/anemic/models/fusion.py similarity index 98% rename from src/models/fusion.py rename to anemic/models/fusion.py index e2a5d82..4c154d4 100755 --- a/src/models/fusion.py +++ b/anemic/models/fusion.py @@ -11,9 +11,9 @@ import torch.nn.functional as F from torch.nn.init import xavier_uniform_ as xavier_uniform -from src.utils.model_utils import load_lookups -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.model_utils import load_lookups +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -417,7 +417,7 @@ class Fusion(nn.Module): def __init__(self, config): super(Fusion, self).__init__() - logger.info(f"Initialising %s", self.__class__.__name__) + logger.info("Initialising %s", self.__class__.__name__) logger.debug( "Initialising %s with config: %s", self.__class__.__name__, config ) diff --git a/src/models/multirescnn.py b/anemic/models/multirescnn.py similarity index 98% rename from src/models/multirescnn.py rename to anemic/models/multirescnn.py index 95b1241..1b6b147 100644 --- a/src/models/multirescnn.py +++ b/anemic/models/multirescnn.py @@ -1,5 +1,6 @@ """ -ICD Coding from Clinical Text Using Multi-Filter Residual Convolutional Neural Network, 2020 +ICD Coding from Clinical Text Using Multi-Filter Residual Convolutional Neural +Network, 2020 https://github.com/foxlf823/Multi-Filter-Residual-Convolutional-Neural-Network """ @@ -10,9 +11,9 @@ import torch.nn.functional as F from torch.nn.init import xavier_uniform_ as xavier_uniform -from src.utils.mapper import ConfigMapper -from src.utils.model_utils import load_lookups -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.model_utils import load_lookups +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/models/transicd/model.py b/anemic/models/transicd.py similarity index 96% rename from src/models/transicd/model.py rename to anemic/models/transicd.py index a337a07..5c9b62f 100644 --- a/src/models/transicd/model.py +++ b/anemic/models/transicd.py @@ -4,8 +4,8 @@ import torch.nn as nn from torch.autograd import Variable -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -114,7 +114,7 @@ def forward(self, inputs): # outputs = torch.zeros(batch_size, self.num_classes).to(inputs.device) # for code, ff_layer in enumerate(self.ff_layers): - # outputs[:, code : code + 1] = ff_layer(weighted_outputs[:, code, :]) + # outputs[:, code : code + 1] = ff_layer(weighted_outputs[:, code]) # Trick: Use one linear layer as per-code linear layers outputs = (weighted_outputs * self.ff_layer.weight).sum(axis=2) @@ -135,7 +135,7 @@ class WordEmbeddingLayer(nn.Module): Args: embed_dir (str): A directory containing the pre-trained word 
embedding matrix, among other things. Please see - https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 + https://github.com/dalgu90/icd-coding-benchmark/blob/main/anemic/modules/embeddings.py#L17 for more details. """ @@ -223,9 +223,7 @@ class LabelAttentionLayer(nn.Module): Defaults to 2. """ - def __init__( - self, embed_size=128, num_classes=50, attn_expansion=2 - ): + def __init__(self, embed_size=128, num_classes=50, attn_expansion=2): super(LabelAttentionLayer, self).__init__() logger.debug( f"Initialising {self.__class__.__name__} with " diff --git a/anemic/modules/__init__.py b/anemic/modules/__init__.py new file mode 100755 index 0000000..e16fe6a --- /dev/null +++ b/anemic/modules/__init__.py @@ -0,0 +1,12 @@ +from anemic.modules import ( + activations, + dataset_splitters, + embeddings, + losses, + metrics, + optimizers, + preprocessing_pipelines, + preprocessors, + schedulers, + tokenizers, +) diff --git a/src/modules/activations.py b/anemic/modules/activations.py similarity index 80% rename from src/modules/activations.py rename to anemic/modules/activations.py index 8433949..fb03ee4 100755 --- a/src/modules/activations.py +++ b/anemic/modules/activations.py @@ -1,6 +1,6 @@ import torch.nn as nn -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("activations", "relu")(nn.ReLU) ConfigMapper.map("activations", "logsoftmax")(nn.LogSoftmax) diff --git a/src/modules/dataset_splitters.py b/anemic/modules/dataset_splitters.py similarity index 87% rename from src/modules/dataset_splitters.py rename to anemic/modules/dataset_splitters.py index 54a9a6c..2ec195f 100644 --- a/src/modules/dataset_splitters.py +++ b/anemic/modules/dataset_splitters.py @@ -1,9 +1,9 @@ import logging import os -from src.utils.file_loaders import load_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/modules/embeddings.py b/anemic/modules/embeddings.py similarity index 93% rename from src/modules/embeddings.py rename to anemic/modules/embeddings.py index 3b31c1e..ce83881 100755 --- a/src/modules/embeddings.py +++ b/anemic/modules/embeddings.py @@ -6,9 +6,9 @@ import gensim import numpy as np -from src.utils.file_loaders import load_json, save_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_json, save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/modules/losses.py b/anemic/modules/losses.py similarity index 96% rename from src/modules/losses.py rename to anemic/modules/losses.py index 8afceaa..0323c72 100755 --- a/src/modules/losses.py +++ b/anemic/modules/losses.py @@ -7,8 +7,8 @@ from torch.autograd import Variable from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from src.utils.file_loaders import load_json -from src.utils.mapper import ConfigMapper +from anemic.utils.file_loaders import load_json +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("losses", "mse")(MSELoss) ConfigMapper.map("losses", "CrossEntropyLoss")(CrossEntropyLoss) diff --git a/src/modules/metrics.py b/anemic/modules/metrics.py similarity index 97% rename from src/modules/metrics.py rename to 
anemic/modules/metrics.py index cdf1a41..f20908a 100755 --- a/src/modules/metrics.py +++ b/anemic/modules/metrics.py @@ -10,9 +10,9 @@ roc_auc_score, ) -from src.utils.configuration import Config -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.configuration import Config +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -23,6 +23,7 @@ def to_np_array(array): array = np.array(array) return array + def _auc_job(x): return roc_auc_score(x[0], x[1]) diff --git a/src/modules/optimizers.py b/anemic/modules/optimizers.py similarity index 80% rename from src/modules/optimizers.py rename to anemic/modules/optimizers.py index 55edccf..be8106f 100755 --- a/src/modules/optimizers.py +++ b/anemic/modules/optimizers.py @@ -1,7 +1,7 @@ """Method containing activation functions""" from torch.optim import SGD, Adam, AdamW -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("optimizers", "adam")(Adam) ConfigMapper.map("optimizers", "adam_w")(AdamW) diff --git a/src/modules/preprocessing_pipelines.py b/anemic/modules/preprocessing_pipelines.py similarity index 96% rename from src/modules/preprocessing_pipelines.py rename to anemic/modules/preprocessing_pipelines.py index d7f2bd4..9bbcd00 100644 --- a/src/modules/preprocessing_pipelines.py +++ b/anemic/modules/preprocessing_pipelines.py @@ -5,14 +5,11 @@ import pandas as pd from tqdm.auto import tqdm -from src.modules.dataset_splitters import * -from src.modules.embeddings import * -from src.modules.preprocessors import ClinicalNotePreprocessor, CodeProcessor -from src.modules.tokenizers import * -from src.utils.code_based_filtering import TopKCodes -from src.utils.file_loaders import load_csv_as_df, save_df, save_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.modules.preprocessors import ClinicalNotePreprocessor, CodeProcessor +from anemic.utils.code_based_filtering import TopKCodes +from anemic.utils.file_loaders import load_csv_as_df, save_df, save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) tqdm.pandas() diff --git a/src/modules/preprocessors.py b/anemic/modules/preprocessors.py similarity index 98% rename from src/modules/preprocessors.py rename to anemic/modules/preprocessors.py index 1dc8782..ae9dba0 100755 --- a/src/modules/preprocessors.py +++ b/anemic/modules/preprocessors.py @@ -13,8 +13,8 @@ ) from nltk.tokenize import RegexpTokenizer -from src.utils.file_loaders import load_json -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_json +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/modules/schedulers.py b/anemic/modules/schedulers.py similarity index 90% rename from src/modules/schedulers.py rename to anemic/modules/schedulers.py index b8a07f8..ac6e6d9 100755 --- a/src/modules/schedulers.py +++ b/anemic/modules/schedulers.py @@ -8,7 +8,7 @@ ) from transformers import get_linear_schedule_with_warmup -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("schedulers", "step")(StepLR) ConfigMapper.map("schedulers", "cosineanneal")(CosineAnnealingLR) diff --git a/src/modules/tokenizers.py b/anemic/modules/tokenizers.py similarity index 89% rename from src/modules/tokenizers.py 
rename to anemic/modules/tokenizers.py index 673093a..50d8146 100755 --- a/src/modules/tokenizers.py +++ b/anemic/modules/tokenizers.py @@ -2,8 +2,8 @@ import logging import sys -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/anemic/trainers/__init__.py b/anemic/trainers/__init__.py new file mode 100644 index 0000000..2364313 --- /dev/null +++ b/anemic/trainers/__init__.py @@ -0,0 +1 @@ +from anemic.trainers import base_trainer diff --git a/src/trainers/base_trainer.py b/anemic/trainers/base_trainer.py similarity index 95% rename from src/trainers/base_trainer.py rename to anemic/trainers/base_trainer.py index 748121b..ca81606 100755 --- a/src/trainers/base_trainer.py +++ b/anemic/trainers/base_trainer.py @@ -7,19 +7,11 @@ from torch.utils.data import DataLoader from tqdm import tqdm -from src.modules.embeddings import * -from src.modules.losses import * -from src.modules.metrics import * -from src.modules.optimizers import * -from src.modules.schedulers import * -from src.modules.tokenizers import * -from src.utils.checkpoint_savers import * -from src.utils.configuration import Config -from src.utils.file_loaders import save_json -from src.utils.graph_writers import * -from src.utils.mapper import ConfigMapper -from src.utils.misc import * -from src.utils.text_loggers import get_logger +from anemic.modules.metrics import load_metric +from anemic.utils.configuration import Config +from anemic.utils.file_loaders import save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -85,17 +77,20 @@ def train(self, model, train_dataset, val_dataset=None): ) scheduler = None if self.config.lr_scheduler is not None: - if 'warmup' in self.config.lr_scheduler.name: - warm_up_steps = self.config.lr_scheduler.params.warm_up_proportion*(len(train_dataset) // batch_size) - num_training_steps = (len(train_dataset) // batch_size) + if "warmup" in self.config.lr_scheduler.name: + warm_up_steps = ( + self.config.lr_scheduler.params.warm_up_proportion + * (len(train_dataset) // batch_size) + ) + num_training_steps = len(train_dataset) // batch_size scheduler = ConfigMapper.get_object( "schedulers", self.config.lr_scheduler.name - )(optimizer,warm_up_steps,num_training_steps) + )(optimizer, warm_up_steps, num_training_steps) logger.debug( f"Created scheduler {scheduler.__class__.__name__} with " f"config: {self.config.lr_scheduler.params}" ) - else : + else: scheduler = ConfigMapper.get_object( "schedulers", self.config.lr_scheduler.name )(optimizer, **self.config.lr_scheduler.params.as_dict()) diff --git a/anemic/utils/__init__.py b/anemic/utils/__init__.py new file mode 100755 index 0000000..e796567 --- /dev/null +++ b/anemic/utils/__init__.py @@ -0,0 +1,12 @@ +from anemic.utils import ( + checkpoint_savers, + code_based_filtering, + configuration, + file_loaders, + graph_writers, + import_related_ops, + mapper, + misc, + model_utils, + text_loggers, +) diff --git a/src/utils/checkpoint_savers.py b/anemic/utils/checkpoint_savers.py similarity index 96% rename from src/utils/checkpoint_savers.py rename to anemic/utils/checkpoint_savers.py index deaabfa..f17d96d 100644 --- a/src/utils/checkpoint_savers.py +++ b/anemic/utils/checkpoint_savers.py @@ -6,10 +6,10 @@ import torch -from src.modules.metrics import load_metric -from src.utils.file_loaders import 
load_json, save_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.modules.metrics import load_metric +from anemic.utils.file_loaders import load_json, save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -186,7 +186,7 @@ def save_ckpt( def load_ckpt(self, model, ckpt_fname, optimizer=None): ckpt_fpath = os.path.join(self.config.checkpoint_dir, ckpt_fname) logger.debug(f"Loading ckpt from {ckpt_fpath}") - checkpoint = torch.load(ckpt_fpath, map_location='cpu') + checkpoint = torch.load(ckpt_fpath, map_location="cpu") model.load_state_dict(checkpoint["model"]) if optimizer: optimizer.load_state_dict(checkpoint["optimizer"]) diff --git a/src/utils/code_based_filtering.py b/anemic/utils/code_based_filtering.py similarity index 95% rename from src/utils/code_based_filtering.py rename to anemic/utils/code_based_filtering.py index 1d845c4..eb2bff8 100644 --- a/src/utils/code_based_filtering.py +++ b/anemic/utils/code_based_filtering.py @@ -5,8 +5,8 @@ import pandas as pd from sklearn.preprocessing import MultiLabelBinarizer -from src.utils.file_loaders import save_json -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import save_json +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/utils/configuration.py b/anemic/utils/configuration.py similarity index 95% rename from src/utils/configuration.py rename to anemic/utils/configuration.py index 11e6ccc..eccee07 100755 --- a/src/utils/configuration.py +++ b/anemic/utils/configuration.py @@ -2,7 +2,7 @@ import yaml -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper def load_yaml(path): @@ -122,8 +122,10 @@ def __getattr__(self, attr): if isinstance(self._config[attr], dict): return Config(dic=self._config[attr]) elif isinstance(self._config[attr], list): - return [Config(dic=e) if isinstance(e, dict) else e - for e in self._config[attr]] + return [ + Config(dic=e) if isinstance(e, dict) else e + for e in self._config[attr] + ] else: return self._config[attr] else: diff --git a/src/utils/file_loaders.py b/anemic/utils/file_loaders.py similarity index 95% rename from src/utils/file_loaders.py rename to anemic/utils/file_loaders.py index 7f48210..38a1f6b 100644 --- a/src/utils/file_loaders.py +++ b/anemic/utils/file_loaders.py @@ -4,7 +4,7 @@ import pandas as pd -from src.utils.text_loggers import get_logger +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/utils/graph_writers.py b/anemic/utils/graph_writers.py similarity index 89% rename from src/utils/graph_writers.py rename to anemic/utils/graph_writers.py index 840ad44..06d08e2 100755 --- a/src/utils/graph_writers.py +++ b/anemic/utils/graph_writers.py @@ -4,8 +4,8 @@ import torch from torch.utils.tensorboard import SummaryWriter -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/utils/import_related_ops.py b/anemic/utils/import_related_ops.py similarity index 100% rename from src/utils/import_related_ops.py rename to anemic/utils/import_related_ops.py diff --git a/src/utils/mapper.py b/anemic/utils/mapper.py similarity index 100% rename from src/utils/mapper.py rename to anemic/utils/mapper.py diff --git 
a/src/utils/misc.py b/anemic/utils/misc.py similarity index 96% rename from src/utils/misc.py rename to anemic/utils/misc.py index 03f76d4..ea182c6 100755 --- a/src/utils/misc.py +++ b/anemic/utils/misc.py @@ -210,4 +210,3 @@ def html_word_importance(words, importances): tags.append("") return "".join(tags) - diff --git a/src/utils/model_utils.py b/anemic/utils/model_utils.py similarity index 97% rename from src/utils/model_utils.py rename to anemic/utils/model_utils.py index ef672b1..74f86ab 100644 --- a/src/utils/model_utils.py +++ b/anemic/utils/model_utils.py @@ -9,9 +9,9 @@ import numpy as np -from src.modules.preprocessors import CodeProcessor -from src.utils.file_loaders import load_csv_as_df, load_json -from src.utils.mapper import ConfigMapper +from anemic.modules.preprocessors import CodeProcessor +from anemic.utils.file_loaders import load_csv_as_df, load_json +from anemic.utils.mapper import ConfigMapper def load_lookups( diff --git a/src/utils/text_loggers.py b/anemic/utils/text_loggers.py similarity index 100% rename from src/utils/text_loggers.py rename to anemic/utils/text_loggers.py diff --git a/app.py b/app.py index 5687010..db1d2a5 100755 --- a/app.py +++ b/app.py @@ -6,21 +6,17 @@ import copy import csv -from captum.attr import LayerIntegratedGradients import numpy as np import pandas as pd import seaborn as sns import streamlit as st import torch +from captum.attr import LayerIntegratedGradients -from src.datasets import * -from src.models import * -from src.modules.embeddings import * -from src.modules.preprocessors import ClinicalNotePreprocessor -from src.utils.checkpoint_savers import * -from src.utils.configuration import Config -from src.utils.mapper import ConfigMapper -from src.utils.misc import html_word_importance +from anemic.modules.preprocessors import ClinicalNotePreprocessor +from anemic.utils.configuration import Config +from anemic.utils.mapper import ConfigMapper +from anemic.utils.misc import html_word_importance hash_funcs = { Config: lambda x: hash(str(x)), @@ -91,7 +87,7 @@ def load_modules(config): # 3. Load model model_dict = {} lig_dict = {} - if hasattr(config, 'models'): + if hasattr(config, "models"): model_configs = config.models else: model_configs = [config] @@ -121,8 +117,10 @@ def load_modules(config): ) embed_layer = getattr(model, embed_layer_name) except: - raise ValueError(f"Config for {model_config.model.name} does not" - "specify name of the embedding layer.") + raise ValueError( + f"Config for {model_config.model.name} does not" + "specify name of the embedding layer." + ) lig = LayerIntegratedGradients(model, embed_layer) model_dict[model_config.model.name] = model @@ -138,6 +136,7 @@ def load_icd_desc(config): icd_desc = {r[0]: r[1] for r in icd_desc} return icd_desc + # Page setup st.set_page_config( page_title="ICD Coding Interactive Demo", @@ -145,7 +144,8 @@ def load_icd_desc(config): layout="wide", ) -st.markdown(""" +st.markdown( + """ """, unsafe_allow_html=True) +""", + unsafe_allow_html=True, +) # Title & status line st.title("🩺 ICD Coding Interactive Demo") @@ -223,15 +225,17 @@ def set_status(text): "NO", "Integrated Gradients", ] - if any(hasattr(model, "get_input_attention") for model in - model_dict.values()): + if any( + hasattr(model, "get_input_attention") + for model in model_dict.values() + ): vis_score_options.append("Attention score") vis_score = st.radio( "Visualize attribution score", vis_score_options, help="""Interpretability visualization methods. 
Attention score is - available only for attention-based models.""" + available only for attention-based models.""", ) vis_code_options = ["Choose ICD code"] @@ -245,8 +249,10 @@ def set_status(text): ) # Preprocessing option selection (truncation is not controlled) - st.markdown("""
Preprocessing
""", - unsafe_allow_html=True) + st.markdown( + """Preprocessing
""", + unsafe_allow_html=True, + ) pp_config = config.clinical_note_preprocessing pp_lower_case = st.checkbox( "Lowercase", @@ -273,8 +279,10 @@ def set_status(text): with col2: # Input text css_str = "line-height:1; margin-top:1rem; margin-bottom:-2rem;" - st.markdown(f"""