diff --git a/.gitignore b/.gitignore index 15f1423..2c32756 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ dmypy.json # Results/Logs results/* +results*/* logs/*.log # toml files diff --git a/anemic/__init__.py b/anemic/__init__.py new file mode 100644 index 0000000..cac0ba3 --- /dev/null +++ b/anemic/__init__.py @@ -0,0 +1,7 @@ +from anemic import ( + datasets, + models, + modules, + trainers, + utils, +) diff --git a/anemic/datasets/__init__.py b/anemic/datasets/__init__.py new file mode 100644 index 0000000..047b697 --- /dev/null +++ b/anemic/datasets/__init__.py @@ -0,0 +1 @@ +from anemic.datasets import base_dataset diff --git a/src/datasets/base_dataset.py b/anemic/datasets/base_dataset.py similarity index 95% rename from src/datasets/base_dataset.py rename to anemic/datasets/base_dataset.py index 912828c..43eda14 100644 --- a/src/datasets/base_dataset.py +++ b/anemic/datasets/base_dataset.py @@ -5,9 +5,9 @@ import torch from torch.utils.data import Dataset -from src.utils.file_loaders import load_csv_as_df, load_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_csv_as_df, load_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/anemic/models/__init__.py b/anemic/models/__init__.py new file mode 100644 index 0000000..2457c48 --- /dev/null +++ b/anemic/models/__init__.py @@ -0,0 +1,8 @@ +from anemic.models import ( + auto_models, + caml, + dcan, + fusion, + multirescnn, + transicd, +) diff --git a/anemic/models/auto_models.py b/anemic/models/auto_models.py new file mode 100644 index 0000000..ec38066 --- /dev/null +++ b/anemic/models/auto_models.py @@ -0,0 +1 @@ +from anemic.utils.mapper import ConfigMapper diff --git a/src/models/caml.py b/anemic/models/caml.py similarity index 98% rename from src/models/caml.py rename to anemic/models/caml.py index 8828292..73c79c6 100644 --- a/src/models/caml.py +++ b/anemic/models/caml.py @@ -12,9 +12,9 @@ from torch.autograd import Variable from torch.nn.init import xavier_uniform -from src.utils.model_utils import load_lookups, pad_desc_vecs -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.model_utils import load_lookups, pad_desc_vecs +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/anemic/models/dcan.py b/anemic/models/dcan.py new file mode 100644 index 0000000..f46c6ba --- /dev/null +++ b/anemic/models/dcan.py @@ -0,0 +1,507 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import xavier_uniform_ +from torch.nn.utils import weight_norm as weight_norm_ + +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger + +logger = get_logger(__name__) + + +@ConfigMapper.map("models", "dcan") +class DCAN(nn.Module): + """ + This class is used to create the DCAN model. + References: + Paper: https://aclanthology.org/2020.clinicalnlp-1.8/ + GitHub Repository: https://github.com/shaoxiongji/DCAN + For the parameters related to convolutional layers, please see this: + https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html. + Args: + num_classes (int): Number of classes (ICD codes). + conv_channel_sizes (list): List of lists of integers. 
Each list + represents the channel sizes of the convolutional + layers in a `TemporalBlock`. So, for example, + if the list is [[100, 600, 600], + [600, 600, 600]], the `TemporalConvNet` layer will have 2 + `TemporalBlock`s, and each temporal block will have + 2 convolutional layers: + Conv(100, 600), Conv(600, 600) for the first + one, and Conv(600, 600), Conv(600, 600) for the + second one. If `add_emb_size_to_channel_sizes` is + True, we don't have to pass the input channel size + of the first block. So, in the above case, we can + just pass [[600, 600], [600, 600, 600]].
+ add_emb_size_to_channel_sizes (bool): If True, you need not specify + the input channel size. Please + see the description of + `conv_channel_sizes`.
+ kernel_sizes (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + kernel size/filter size of the respective + convolutional layer in the `TemporalBlock` layer.
+ strides (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + stride of the respective convolutional layer in the + `TemporalBlock` layer.
+ paddings (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + padding of the respective convolutional layer in the + `TemporalBlock` layer. In DCAN, this value is set to + "(kernel_size - 1) * dilation_size".
+ dilations (list): List of lists of integers (same format as + `conv_channel_sizes`). Each integer represents the + dilation size of the respective convolutional layer in the + `TemporalBlock` layer. In DCAN, this value is + "2^(temporal_block_level)".
+ dropout (float): Dropout probability applied after every convolutional + layer in every `TemporalBlock`.
+ weight_norm (bool): If True, apply weight normalization to the + convolutional layers.
+ activation (str): Activation function to use. Should be one of "relu", + "elu", "leaky_relu".
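As a quick illustration of how these nested-list arguments fit together, here is a minimal sketch (the sizes are made up and not taken from the patch or its config files; only the relationships follow the docstring above):

# Hypothetical DCAN-style hyper-parameters (illustrative values only).
emb_size = 100                                    # word-embedding dimension
conv_channel_sizes = [[600, 600], [600, 600]]     # two TemporalBlocks, two conv layers each
kernel_sizes = [[2, 2], [2, 2]]
strides = [[1, 1], [1, 1]]
dilations = [[1, 1], [2, 2]]                      # 2^(temporal_block_level)
paddings = [[(k - 1) * d for k, d in zip(ks, ds)]
            for ks, ds in zip(kernel_sizes, dilations)]   # (kernel_size - 1) * dilation_size

# With add_emb_size_to_channel_sizes=True, the model prepends the embedding size itself,
# so the first block effectively becomes [100, 600, 600]:
conv_channel_sizes[0] = [emb_size] + conv_channel_sizes[0]
# First block: Conv(100, 600), Conv(600, 600); second block: Conv(600, 600), Conv(600, 600).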
+ """ + + def __init__(self, config): + super(DCAN, self).__init__() + logger.info(f"Initialising {self.__class__.__name__}") + logger.debug( + f"Initialising {self.__class__.__name__} with " f"config: {config}" + ) + + self.config = config + + self.word_embedding_layer = WordEmbeddingLayer( + **config.word_representation_layer.params.init_params.as_dict() + ) + if config.word_representation_layer.params.freeze_layer: + self.freeze_layer(self.word_embedding_layer.embed) + + num_levels = len(config.kernel_sizes) + num_inner_conv_levels = len(config.kernel_sizes[0]) + + conv_channel_sizes = copy.deepcopy(config.conv_channel_sizes) + if config.add_emb_size_to_channel_sizes: + conv_channel_sizes[0] = [ + self.word_embedding_layer.embedding_size + ] + conv_channel_sizes[0] + dropouts = [ + [config.dropout for _ in range(num_inner_conv_levels)] + for _ in range(num_levels) + ] + + self.temporal_conv_net = TemporalConvNet( + conv_channel_sizes_=conv_channel_sizes, + kernel_sizes_=config.kernel_sizes, + strides_=config.strides, + paddings_=config.paddings, + dilations_=config.dilations, + dropouts_=dropouts, + weight_norm=config.weight_norm, + activation=config.activation, + ) + + self.linear_layer = nn.Linear( + conv_channel_sizes[-1][-1], config.projection_size + ) + self.activation = ConfigMapper.get_object( + "activations", config.activation + )() + + self.output_layer = OutputLayer( + config.projection_size, config.num_classes + ) + + xavier_uniform_(self.linear_layer.weight) + + def forward(self, data): + x = self.word_embedding_layer(data) + hid_seq = self.temporal_conv_net(x.transpose(1, 2)).transpose(1, 2) + hid_seq = self.activation(self.linear_layer(hid_seq)) + logits = self.output_layer(hid_seq) + return logits + + def freeze_layer(self, layer): + for param in layer.parameters(): + param.requires_grad = False + + def get_input_attention(self): + # Use the attention score computed in the forward pass + return self.output_layer.label_wise_attn.alpha.cpu().detach().numpy() + + +class OutputLayer(nn.Module): + def __init__(self, input_size, num_classes): + super(OutputLayer, self).__init__() + self.label_wise_attn = LabelWiseAttn(input_size, num_classes) + + self.final = nn.Linear(input_size, num_classes) + xavier_uniform_(self.final.weight) + + def forward(self, x): + m = self.label_wise_attn(x) + logits = self.final.weight.mul(m).sum(dim=2).add(self.final.bias) + return logits + + +class WordEmbeddingLayer(nn.Module): + """ + A Word Embedding Layer. This layer loads a pre-trained word embedding matrix + , and copies its weights to an nn.Embedding layer. + + Args: + embed_dir (str): A directory containing the pre-trained word embedding + matrix, among other things. Please see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/anemic/modules/embeddings.py#L17 + for more details. + dropout (float): The dropout probability. + """ + + def __init__(self, embed_dir, dropout): + super(WordEmbeddingLayer, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"embed_dir = {embed_dir}, dropout = {dropout}" + ) + + # Note: This should be changed, since we won't always use Word2Vec. 
+ embedding_cls = ConfigMapper.get_object("embeddings", "word2vec") + + W = torch.Tensor(embedding_cls.load_emb_matrix(embed_dir)) + self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0) + self.embed.weight.data = W.clone() + + self.embedding_size = self.embed.embedding_dim + + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + embedding = self.embed(x) + x = self.dropout(embedding) + return x + + +class Chomp1d(nn.Module): + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + return x[:, :, : -self.chomp_size].contiguous() + + +class ConvTemporalSubBlock(nn.Module): + """ + A simple temporal convolutional block. Adapted from + https://github.com/shaoxiongji/DCAN/blob/master/models.py#L84-L88. This + layer has a dilated convolutional layer, a `chomp1d` layer, followed by + activation and dropout. For the parameters related to convolutional layers, + please see this: + https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html. + + Args: + in_channels (int): The number of input channels in the convolutional + layer. + out_channels (int): The number of output channels in the convolutional + layer. + kernel_size (int): The size of the kernel in the convolutional layer. + stride (int): The stride of the convolutional layer. + padding (int): The padding of the convolutional layer. + dilation (int): The dilation size of the convolutional layer. + dropout (float): The dropout probability. + weight_norm (bool): Whether to apply weight normalization to the + convolutional layer. + activation (str): The activation function to use. DCAN uses "relu". + For all available activations, see + https://github.com/dalgu90/icd-coding-benchmark/blob/main/anemic/modules/activations.py. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + dropout=0.2, + weight_norm=True, + activation="relu", + ): + super(ConvTemporalSubBlock, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"in_channels = {in_channels}, out_channels = " + f"{out_channels}, kernel_size = {kernel_size}, " + f"stride = {stride}, padding = {padding}, " + f"dilation = {dilation}, dropout = {dropout}, " + f"weight_norm = {weight_norm}, activation = {activation}" + ) + + self.conv_layer = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + ) + if weight_norm: + self.conv_layer = weight_norm_(self.conv_layer) + self.chomp1d = Chomp1d(padding) + self.activation = ConfigMapper.get_object("activations", activation)() + self.dropout = nn.Dropout(dropout) + + self.__init_weights__() + + def __init_weights__(self): + xavier_uniform_(self.conv_layer.weight) + + def forward(self, x): + x = self.conv_layer(x) + x = self.chomp1d(x) + x = self.activation(x) + x = self.dropout(x) + return x + + +class TemporalBlock(nn.Module): + """ + A Temporal Block containing stacks of `ConvTemporalSubBlocks`, followed + by activation. + References: + Paper: https://arxiv.org/abs/2009.14578 + Repository: https://github.com/shaoxiongji/DCAN/blob/master/models.py#L81 + + Args: + conv_channel_sizes (list): List of integers, with channel sizes of + convolutional layers. For example, if the + list is [100, 200, 300], there will be two + convolutional layers: Conv1d(100, 200) and + Conv1d(200, 300). 
+ kernel_sizes (list): List of integers, with kernel sizes of every + `ConvTemporalSubBlock`. + strides (list): List of integers, with strides of convolutional layers. + paddings (list): List of integers, with paddings of every + `ConvTemporalSubBlock`. + dilations (list): List of integers, with dilation sizes of every + `ConvTemporalSubBlock`. + dropouts (list): List of floats, with dropout probabilities of every + `ConvTemporalSubBlock`. + weight_norm (bool): Whether to apply weight normalization to every + convolutional layer. DCAN uses weight norm. + activation (str): The activation function to use. DCAN uses "relu". + """ + + def __init__( + self, + conv_channel_sizes, + kernel_sizes, + strides, + paddings, + dilations, + dropouts, + weight_norm=True, + activation="relu", + ): + super(TemporalBlock, self).__init__() + conv_channel_size_pairs = list( + zip(conv_channel_sizes[:-1], conv_channel_sizes[1:]) + ) + + self.conv_temporal_sub_blocks = nn.ModuleList( + [ + ConvTemporalSubBlock( + in_channels=conv_channel_size_pair[0], + out_channels=conv_channel_size_pair[1], + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + dropout=dropout, + weight_norm=weight_norm, + activation=activation, + ) + for ( + conv_channel_size_pair, + kernel_size, + stride, + padding, + dilation, + dropout, + ) in zip( + conv_channel_size_pairs, + kernel_sizes, + strides, + paddings, + dilations, + dropouts, + ) + ] + ) + + self.downsample = ( + nn.Conv1d(conv_channel_sizes[0], conv_channel_sizes[-1], 1) + if conv_channel_sizes[0] != conv_channel_sizes[-1] + else None + ) + self.output_activation = ConfigMapper.get_object( + "activations", activation + )() + + self.init_weights() + + def init_weights(self): + if self.downsample is not None: + xavier_uniform_(self.downsample.weight) + + def forward(self, x): + conv_layer_output = x + for conv_temporal_sub_block in self.conv_temporal_sub_blocks: + conv_layer_output = conv_temporal_sub_block(conv_layer_output) + res = x if self.downsample is None else self.downsample(x) + return self.output_activation(conv_layer_output + res) + + +class TemporalConvNet(nn.Module): + """ + Stack of `TemporalBlock`s. Used in the DCAN model. + References: + Paper: https://arxiv.org/abs/2009.14578 + Repository: https://github.com/shaoxiongji/DCAN/blob/master/models.py#L114 + + Args: + conv_channel_sizes_ (list): List of lists of integers. Each list + represents the channel sizes of convolutional + layers in a `TemporalBlock`. So, for + example, if the list is [[100, 600, 600], + [600, 600, 600]]. + the `TemporalConvNet` layer will have 2 + `TemporalBlock`s, each temporal block have + 2 convolutional layers: + Conv(100, 600), Conv(600, 600) for the first + one, and Conv(600, 600), Conv(600, 600). If + the `add_emb_size_to_channel_sizes`, we + don't have to pass the input channel size. + So, in the above case, we can just pass + [[600, 600], [600, 600, 600]]. + kernel_sizes_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + kernel size/filter size of the respective + convolutional layer in `TemporalBlock` layer. + strides_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + stride of the respective convolutional layer in + `TemporalBlock` layer. + paddings_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + padding of the respective convolutional layer in + `TemporalBlock` layer. 
in DCAN, this value is set to + "(kernel_size - 1) * dilation_size". + dilations_ (list): List of list of integers (same format as + `conv_channel_sizes`). Each integer represents the + dilation size of the respective convolutional layer + `TemporalBlock` layer.` In DCAN, this value is + "2^(temporal_block_level)". + dropouts_ (list): List of list of floats (same format as + `conv_channel_sizes`). Each float represents the + dropout probability of the respective convolutional + `TemporalBlock` layer. + weight_norm (bool): If True, apply weight normalization to the + convolutional layers. + activation (str): Activation function to use. DCAN uses "relu". + """ + + def __init__( + self, + conv_channel_sizes_, + kernel_sizes_, + strides_, + paddings_, + dilations_, + dropouts_, + weight_norm=True, + activation="relu", + ): + super(TemporalConvNet, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"conv_channel_sizes_ = {conv_channel_sizes_}, " + f"kernel_sizes_ = {kernel_sizes_}, " + f"strides_ = {strides_}, paddings_ = {paddings_}, " + f"dilations_ = {dilations_}, dropouts_ = {dropouts_}, " + f"weight_norm = {weight_norm}, activation = {activation}" + ) + + self.temporal_blocks = nn.ModuleList( + [ + TemporalBlock( + conv_channel_sizes=conv_channel_sizes, + kernel_sizes=kernel_sizes, + strides=strides, + paddings=paddings, + dilations=dilations, + dropouts=dropouts, + weight_norm=weight_norm, + activation=activation, + ) + for ( + conv_channel_sizes, + kernel_sizes, + strides, + paddings, + dilations, + dropouts, + ) in zip( + conv_channel_sizes_, + kernel_sizes_, + strides_, + paddings_, + dilations_, + dropouts_, + ) + ] + ) + + def forward(self, x): + for temporal_block in self.temporal_blocks: + x = temporal_block(x) + return x + + +class LabelWiseAttn(nn.Module): + """ + A Label-wise Attention layer (as implemented in CAML, DCAN, etc.). + References: + Papers: https://arxiv.org/abs/1802.05695 (Section 2.2) + Repository: https://github.com/jamesmullenbach/caml-mimic/blob/master/learn/models.py#L184 + + Args: + input_size (int): The size of the input, i.e., the number of channels + if the output is from a convolutional layer/embedding + size if the output is from a fully connected layer. + num_classes (int): The number of classes. 
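To make the attention shapes concrete, here is a small self-contained sketch of the label-wise attention described here, together with the per-label output computation from `OutputLayer` earlier in this file (toy sizes, illustrative only, not part of the patch):

import torch
import torch.nn as nn
import torch.nn.functional as F

bs, seq_len, dim, num_classes = 2, 5, 4, 3         # toy sizes
x = torch.randn(bs, seq_len, dim)                  # per-token features fed to LabelWiseAttn

U = nn.Linear(dim, num_classes)                    # attention parameters, one row per ICD code
att = U.weight.matmul(x.transpose(1, 2))           # [bs, num_classes, seq_len]
alpha = F.softmax(att, dim=2)                      # attention over positions, one distribution per label
m = alpha.matmul(x)                                # [bs, num_classes, dim] label-specific summaries

final = nn.Linear(dim, num_classes)                # as in OutputLayer.final
logits = final.weight.mul(m).sum(dim=2).add(final.bias)   # [bs, num_classes]
print(logits.shape)                                # torch.Size([2, 3])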
+ """ + + def __init__(self, input_size, num_classes): + super(LabelWiseAttn, self).__init__() + logger.debug( + f"Initialising {self.__class__.__name__} with " + f"input size = {input_size}, num_classes = {num_classes}" + ) + + self.U = nn.Linear(input_size, num_classes) + xavier_uniform_(self.U.weight) + + def forward(self, x): + att = self.U.weight.matmul(x.transpose(1, 2)) # [bs, Y, seq_len] + self.alpha = F.softmax(att, dim=2) + m = self.alpha.matmul(x) # [bs, Y, dim] + return m diff --git a/src/models/fusion.py b/anemic/models/fusion.py similarity index 98% rename from src/models/fusion.py rename to anemic/models/fusion.py index e2a5d82..4c154d4 100755 --- a/src/models/fusion.py +++ b/anemic/models/fusion.py @@ -11,9 +11,9 @@ import torch.nn.functional as F from torch.nn.init import xavier_uniform_ as xavier_uniform -from src.utils.model_utils import load_lookups -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.model_utils import load_lookups +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -417,7 +417,7 @@ class Fusion(nn.Module): def __init__(self, config): super(Fusion, self).__init__() - logger.info(f"Initialising %s", self.__class__.__name__) + logger.info("Initialising %s", self.__class__.__name__) logger.debug( "Initialising %s with config: %s", self.__class__.__name__, config ) diff --git a/src/models/multirescnn.py b/anemic/models/multirescnn.py similarity index 98% rename from src/models/multirescnn.py rename to anemic/models/multirescnn.py index 95b1241..1b6b147 100644 --- a/src/models/multirescnn.py +++ b/anemic/models/multirescnn.py @@ -1,5 +1,6 @@ """ -ICD Coding from Clinical Text Using Multi-Filter Residual Convolutional Neural Network, 2020 +ICD Coding from Clinical Text Using Multi-Filter Residual Convolutional Neural +Network, 2020 https://github.com/foxlf823/Multi-Filter-Residual-Convolutional-Neural-Network """ @@ -10,9 +11,9 @@ import torch.nn.functional as F from torch.nn.init import xavier_uniform_ as xavier_uniform -from src.utils.mapper import ConfigMapper -from src.utils.model_utils import load_lookups -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.model_utils import load_lookups +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/models/transicd/model.py b/anemic/models/transicd.py similarity index 96% rename from src/models/transicd/model.py rename to anemic/models/transicd.py index a337a07..5c9b62f 100644 --- a/src/models/transicd/model.py +++ b/anemic/models/transicd.py @@ -4,8 +4,8 @@ import torch.nn as nn from torch.autograd import Variable -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -114,7 +114,7 @@ def forward(self, inputs): # outputs = torch.zeros(batch_size, self.num_classes).to(inputs.device) # for code, ff_layer in enumerate(self.ff_layers): - # outputs[:, code : code + 1] = ff_layer(weighted_outputs[:, code, :]) + # outputs[:, code : code + 1] = ff_layer(weighted_outputs[:, code]) # Trick: Use one linear layer as per-code linear layers outputs = (weighted_outputs * self.ff_layer.weight).sum(axis=2) @@ -135,7 +135,7 @@ class WordEmbeddingLayer(nn.Module): Args: embed_dir (str): A directory containing the pre-trained word 
embedding matrix, among other things. Please see - https://github.com/dalgu90/icd-coding-benchmark/blob/main/src/modules/embeddings.py#L17 + https://github.com/dalgu90/icd-coding-benchmark/blob/main/anemic/modules/embeddings.py#L17 for more details. """ @@ -223,9 +223,7 @@ class LabelAttentionLayer(nn.Module): Defaults to 2. """ - def __init__( - self, embed_size=128, num_classes=50, attn_expansion=2 - ): + def __init__(self, embed_size=128, num_classes=50, attn_expansion=2): super(LabelAttentionLayer, self).__init__() logger.debug( f"Initialising {self.__class__.__name__} with " diff --git a/anemic/modules/__init__.py b/anemic/modules/__init__.py new file mode 100755 index 0000000..e16fe6a --- /dev/null +++ b/anemic/modules/__init__.py @@ -0,0 +1,12 @@ +from anemic.modules import ( + activations, + dataset_splitters, + embeddings, + losses, + metrics, + optimizers, + preprocessing_pipelines, + preprocessors, + schedulers, + tokenizers, +) diff --git a/src/modules/activations.py b/anemic/modules/activations.py similarity index 80% rename from src/modules/activations.py rename to anemic/modules/activations.py index 8433949..fb03ee4 100755 --- a/src/modules/activations.py +++ b/anemic/modules/activations.py @@ -1,6 +1,6 @@ import torch.nn as nn -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("activations", "relu")(nn.ReLU) ConfigMapper.map("activations", "logsoftmax")(nn.LogSoftmax) diff --git a/src/modules/dataset_splitters.py b/anemic/modules/dataset_splitters.py similarity index 87% rename from src/modules/dataset_splitters.py rename to anemic/modules/dataset_splitters.py index 54a9a6c..2ec195f 100644 --- a/src/modules/dataset_splitters.py +++ b/anemic/modules/dataset_splitters.py @@ -1,9 +1,9 @@ import logging import os -from src.utils.file_loaders import load_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/modules/embeddings.py b/anemic/modules/embeddings.py similarity index 93% rename from src/modules/embeddings.py rename to anemic/modules/embeddings.py index 3b31c1e..ce83881 100755 --- a/src/modules/embeddings.py +++ b/anemic/modules/embeddings.py @@ -6,9 +6,9 @@ import gensim import numpy as np -from src.utils.file_loaders import load_json, save_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_json, save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/modules/losses.py b/anemic/modules/losses.py similarity index 96% rename from src/modules/losses.py rename to anemic/modules/losses.py index 8afceaa..0323c72 100755 --- a/src/modules/losses.py +++ b/anemic/modules/losses.py @@ -7,8 +7,8 @@ from torch.autograd import Variable from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from src.utils.file_loaders import load_json -from src.utils.mapper import ConfigMapper +from anemic.utils.file_loaders import load_json +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("losses", "mse")(MSELoss) ConfigMapper.map("losses", "CrossEntropyLoss")(CrossEntropyLoss) diff --git a/src/modules/metrics.py b/anemic/modules/metrics.py similarity index 97% rename from src/modules/metrics.py rename to 
anemic/modules/metrics.py index cdf1a41..f20908a 100755 --- a/src/modules/metrics.py +++ b/anemic/modules/metrics.py @@ -10,9 +10,9 @@ roc_auc_score, ) -from src.utils.configuration import Config -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.configuration import Config +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -23,6 +23,7 @@ def to_np_array(array): array = np.array(array) return array + def _auc_job(x): return roc_auc_score(x[0], x[1]) diff --git a/src/modules/optimizers.py b/anemic/modules/optimizers.py similarity index 80% rename from src/modules/optimizers.py rename to anemic/modules/optimizers.py index 55edccf..be8106f 100755 --- a/src/modules/optimizers.py +++ b/anemic/modules/optimizers.py @@ -1,7 +1,7 @@ """Method containing activation functions""" from torch.optim import SGD, Adam, AdamW -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("optimizers", "adam")(Adam) ConfigMapper.map("optimizers", "adam_w")(AdamW) diff --git a/src/modules/preprocessing_pipelines.py b/anemic/modules/preprocessing_pipelines.py similarity index 96% rename from src/modules/preprocessing_pipelines.py rename to anemic/modules/preprocessing_pipelines.py index d7f2bd4..9bbcd00 100644 --- a/src/modules/preprocessing_pipelines.py +++ b/anemic/modules/preprocessing_pipelines.py @@ -5,14 +5,11 @@ import pandas as pd from tqdm.auto import tqdm -from src.modules.dataset_splitters import * -from src.modules.embeddings import * -from src.modules.preprocessors import ClinicalNotePreprocessor, CodeProcessor -from src.modules.tokenizers import * -from src.utils.code_based_filtering import TopKCodes -from src.utils.file_loaders import load_csv_as_df, save_df, save_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.modules.preprocessors import ClinicalNotePreprocessor, CodeProcessor +from anemic.utils.code_based_filtering import TopKCodes +from anemic.utils.file_loaders import load_csv_as_df, save_df, save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) tqdm.pandas() diff --git a/src/modules/preprocessors.py b/anemic/modules/preprocessors.py similarity index 98% rename from src/modules/preprocessors.py rename to anemic/modules/preprocessors.py index 1dc8782..ae9dba0 100755 --- a/src/modules/preprocessors.py +++ b/anemic/modules/preprocessors.py @@ -13,8 +13,8 @@ ) from nltk.tokenize import RegexpTokenizer -from src.utils.file_loaders import load_json -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import load_json +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/modules/schedulers.py b/anemic/modules/schedulers.py similarity index 90% rename from src/modules/schedulers.py rename to anemic/modules/schedulers.py index b8a07f8..ac6e6d9 100755 --- a/src/modules/schedulers.py +++ b/anemic/modules/schedulers.py @@ -8,7 +8,7 @@ ) from transformers import get_linear_schedule_with_warmup -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper ConfigMapper.map("schedulers", "step")(StepLR) ConfigMapper.map("schedulers", "cosineanneal")(CosineAnnealingLR) diff --git a/src/modules/tokenizers.py b/anemic/modules/tokenizers.py similarity index 89% rename from src/modules/tokenizers.py 
rename to anemic/modules/tokenizers.py index 673093a..50d8146 100755 --- a/src/modules/tokenizers.py +++ b/anemic/modules/tokenizers.py @@ -2,8 +2,8 @@ import logging import sys -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/anemic/trainers/__init__.py b/anemic/trainers/__init__.py new file mode 100644 index 0000000..2364313 --- /dev/null +++ b/anemic/trainers/__init__.py @@ -0,0 +1 @@ +from anemic.trainers import base_trainer diff --git a/src/trainers/base_trainer.py b/anemic/trainers/base_trainer.py similarity index 95% rename from src/trainers/base_trainer.py rename to anemic/trainers/base_trainer.py index 748121b..ca81606 100755 --- a/src/trainers/base_trainer.py +++ b/anemic/trainers/base_trainer.py @@ -7,19 +7,11 @@ from torch.utils.data import DataLoader from tqdm import tqdm -from src.modules.embeddings import * -from src.modules.losses import * -from src.modules.metrics import * -from src.modules.optimizers import * -from src.modules.schedulers import * -from src.modules.tokenizers import * -from src.utils.checkpoint_savers import * -from src.utils.configuration import Config -from src.utils.file_loaders import save_json -from src.utils.graph_writers import * -from src.utils.mapper import ConfigMapper -from src.utils.misc import * -from src.utils.text_loggers import get_logger +from anemic.modules.metrics import load_metric +from anemic.utils.configuration import Config +from anemic.utils.file_loaders import save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -85,17 +77,20 @@ def train(self, model, train_dataset, val_dataset=None): ) scheduler = None if self.config.lr_scheduler is not None: - if 'warmup' in self.config.lr_scheduler.name: - warm_up_steps = self.config.lr_scheduler.params.warm_up_proportion*(len(train_dataset) // batch_size) - num_training_steps = (len(train_dataset) // batch_size) + if "warmup" in self.config.lr_scheduler.name: + warm_up_steps = ( + self.config.lr_scheduler.params.warm_up_proportion + * (len(train_dataset) // batch_size) + ) + num_training_steps = len(train_dataset) // batch_size scheduler = ConfigMapper.get_object( "schedulers", self.config.lr_scheduler.name - )(optimizer,warm_up_steps,num_training_steps) + )(optimizer, warm_up_steps, num_training_steps) logger.debug( f"Created scheduler {scheduler.__class__.__name__} with " f"config: {self.config.lr_scheduler.params}" ) - else : + else: scheduler = ConfigMapper.get_object( "schedulers", self.config.lr_scheduler.name )(optimizer, **self.config.lr_scheduler.params.as_dict()) diff --git a/anemic/utils/__init__.py b/anemic/utils/__init__.py new file mode 100755 index 0000000..e796567 --- /dev/null +++ b/anemic/utils/__init__.py @@ -0,0 +1,12 @@ +from anemic.utils import ( + checkpoint_savers, + code_based_filtering, + configuration, + file_loaders, + graph_writers, + import_related_ops, + mapper, + misc, + model_utils, + text_loggers, +) diff --git a/src/utils/checkpoint_savers.py b/anemic/utils/checkpoint_savers.py similarity index 96% rename from src/utils/checkpoint_savers.py rename to anemic/utils/checkpoint_savers.py index deaabfa..f17d96d 100644 --- a/src/utils/checkpoint_savers.py +++ b/anemic/utils/checkpoint_savers.py @@ -6,10 +6,10 @@ import torch -from src.modules.metrics import load_metric -from src.utils.file_loaders import 
load_json, save_json -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.modules.metrics import load_metric +from anemic.utils.file_loaders import load_json, save_json +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) @@ -186,7 +186,7 @@ def save_ckpt( def load_ckpt(self, model, ckpt_fname, optimizer=None): ckpt_fpath = os.path.join(self.config.checkpoint_dir, ckpt_fname) logger.debug(f"Loading ckpt from {ckpt_fpath}") - checkpoint = torch.load(ckpt_fpath, map_location='cpu') + checkpoint = torch.load(ckpt_fpath, map_location="cpu") model.load_state_dict(checkpoint["model"]) if optimizer: optimizer.load_state_dict(checkpoint["optimizer"]) diff --git a/src/utils/code_based_filtering.py b/anemic/utils/code_based_filtering.py similarity index 95% rename from src/utils/code_based_filtering.py rename to anemic/utils/code_based_filtering.py index 1d845c4..eb2bff8 100644 --- a/src/utils/code_based_filtering.py +++ b/anemic/utils/code_based_filtering.py @@ -5,8 +5,8 @@ import pandas as pd from sklearn.preprocessing import MultiLabelBinarizer -from src.utils.file_loaders import save_json -from src.utils.text_loggers import get_logger +from anemic.utils.file_loaders import save_json +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/utils/configuration.py b/anemic/utils/configuration.py similarity index 95% rename from src/utils/configuration.py rename to anemic/utils/configuration.py index 11e6ccc..eccee07 100755 --- a/src/utils/configuration.py +++ b/anemic/utils/configuration.py @@ -2,7 +2,7 @@ import yaml -from src.utils.mapper import ConfigMapper +from anemic.utils.mapper import ConfigMapper def load_yaml(path): @@ -122,8 +122,10 @@ def __getattr__(self, attr): if isinstance(self._config[attr], dict): return Config(dic=self._config[attr]) elif isinstance(self._config[attr], list): - return [Config(dic=e) if isinstance(e, dict) else e - for e in self._config[attr]] + return [ + Config(dic=e) if isinstance(e, dict) else e + for e in self._config[attr] + ] else: return self._config[attr] else: diff --git a/src/utils/file_loaders.py b/anemic/utils/file_loaders.py similarity index 95% rename from src/utils/file_loaders.py rename to anemic/utils/file_loaders.py index 7f48210..38a1f6b 100644 --- a/src/utils/file_loaders.py +++ b/anemic/utils/file_loaders.py @@ -4,7 +4,7 @@ import pandas as pd -from src.utils.text_loggers import get_logger +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/utils/graph_writers.py b/anemic/utils/graph_writers.py similarity index 89% rename from src/utils/graph_writers.py rename to anemic/utils/graph_writers.py index 840ad44..06d08e2 100755 --- a/src/utils/graph_writers.py +++ b/anemic/utils/graph_writers.py @@ -4,8 +4,8 @@ import torch from torch.utils.tensorboard import SummaryWriter -from src.utils.mapper import ConfigMapper -from src.utils.text_loggers import get_logger +from anemic.utils.mapper import ConfigMapper +from anemic.utils.text_loggers import get_logger logger = get_logger(__name__) diff --git a/src/utils/import_related_ops.py b/anemic/utils/import_related_ops.py similarity index 100% rename from src/utils/import_related_ops.py rename to anemic/utils/import_related_ops.py diff --git a/src/utils/mapper.py b/anemic/utils/mapper.py similarity index 100% rename from src/utils/mapper.py rename to anemic/utils/mapper.py diff --git 
a/src/utils/misc.py b/anemic/utils/misc.py similarity index 96% rename from src/utils/misc.py rename to anemic/utils/misc.py index 03f76d4..ea182c6 100755 --- a/src/utils/misc.py +++ b/anemic/utils/misc.py @@ -210,4 +210,3 @@ def html_word_importance(words, importances): tags.append("") return "".join(tags) - diff --git a/src/utils/model_utils.py b/anemic/utils/model_utils.py similarity index 97% rename from src/utils/model_utils.py rename to anemic/utils/model_utils.py index ef672b1..74f86ab 100644 --- a/src/utils/model_utils.py +++ b/anemic/utils/model_utils.py @@ -9,9 +9,9 @@ import numpy as np -from src.modules.preprocessors import CodeProcessor -from src.utils.file_loaders import load_csv_as_df, load_json -from src.utils.mapper import ConfigMapper +from anemic.modules.preprocessors import CodeProcessor +from anemic.utils.file_loaders import load_csv_as_df, load_json +from anemic.utils.mapper import ConfigMapper def load_lookups( diff --git a/src/utils/text_loggers.py b/anemic/utils/text_loggers.py similarity index 100% rename from src/utils/text_loggers.py rename to anemic/utils/text_loggers.py diff --git a/app.py b/app.py index 5687010..db1d2a5 100755 --- a/app.py +++ b/app.py @@ -6,21 +6,17 @@ import copy import csv -from captum.attr import LayerIntegratedGradients import numpy as np import pandas as pd import seaborn as sns import streamlit as st import torch +from captum.attr import LayerIntegratedGradients -from src.datasets import * -from src.models import * -from src.modules.embeddings import * -from src.modules.preprocessors import ClinicalNotePreprocessor -from src.utils.checkpoint_savers import * -from src.utils.configuration import Config -from src.utils.mapper import ConfigMapper -from src.utils.misc import html_word_importance +from anemic.modules.preprocessors import ClinicalNotePreprocessor +from anemic.utils.configuration import Config +from anemic.utils.mapper import ConfigMapper +from anemic.utils.misc import html_word_importance hash_funcs = { Config: lambda x: hash(str(x)), @@ -91,7 +87,7 @@ def load_modules(config): # 3. Load model model_dict = {} lig_dict = {} - if hasattr(config, 'models'): + if hasattr(config, "models"): model_configs = config.models else: model_configs = [config] @@ -121,8 +117,10 @@ def load_modules(config): ) embed_layer = getattr(model, embed_layer_name) except: - raise ValueError(f"Config for {model_config.model.name} does not" - "specify name of the embedding layer.") + raise ValueError( + f"Config for {model_config.model.name} does not" + "specify name of the embedding layer." + ) lig = LayerIntegratedGradients(model, embed_layer) model_dict[model_config.model.name] = model @@ -138,6 +136,7 @@ def load_icd_desc(config): icd_desc = {r[0]: r[1] for r in icd_desc} return icd_desc + # Page setup st.set_page_config( page_title="ICD Coding Interactive Demo", @@ -145,7 +144,8 @@ def load_icd_desc(config): layout="wide", ) -st.markdown(""" +st.markdown( + """ """, unsafe_allow_html=True) +""", + unsafe_allow_html=True, +) # Title & status line st.title("🩺 ICD Coding Interactive Demo") @@ -223,15 +225,17 @@ def set_status(text): "NO", "Integrated Gradients", ] - if any(hasattr(model, "get_input_attention") for model in - model_dict.values()): + if any( + hasattr(model, "get_input_attention") + for model in model_dict.values() + ): vis_score_options.append("Attention score") vis_score = st.radio( "Visualize attribution score", vis_score_options, help="""Interpretability visualization methods. 
Attention score is - available only for attention-based models.""" + available only for attention-based models.""", ) vis_code_options = ["Choose ICD code"] @@ -245,8 +249,10 @@ def set_status(text): ) # Preprocessing option selection (truncation is not controlled) - st.markdown("""
Preprocessing
""", - unsafe_allow_html=True) + st.markdown( + """Preprocessing
""", + unsafe_allow_html=True, + ) pp_config = config.clinical_note_preprocessing pp_lower_case = st.checkbox( "Lowercase", @@ -273,8 +279,10 @@ def set_status(text): with col2: # Input text css_str = "line-height:1; margin-top:1rem; margin-bottom:-2rem;" - st.markdown(f"""