@@ -4,13 +4,15 @@
 
 import torch
 from captum.attr._core.feature_ablation import FeatureAblation
+from captum.attr._core.layer.layer_integrated_gradients import LayerIntegratedGradients
 from captum.attr._core.shapley_value import ShapleyValues, ShapleyValueSampling
 from captum.attr._utils.attribution import Attribution
-from captum.attr._utils.interpretable_input import InterpretableInput, TextTemplateInput
+from captum.attr._utils.interpretable_input import (
+    InterpretableInput,
+    TextTemplateInput,
+    TextTokenInput,
+)
 from torch import nn, Tensor
 
 
-SUPPORTED_METHODS = (FeatureAblation, ShapleyValueSampling, ShapleyValues)
-SUPPORTED_INPUTS = (TextTemplateInput,)
-
 DEFAULT_GEN_ARGS = {"max_new_tokens": 25, "do_sample": False}
1719
1820
@@ -57,6 +59,9 @@ class LLMAttribution(Attribution):
     and returns LLMAttributionResult
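+
+    Example (a minimal usage sketch; assumes a huggingface-style causal LM
+    ``model`` and its ``tokenizer`` are already loaded; the template values
+    are illustrative)::
+
+        >>> fa = FeatureAblation(model)
+        >>> llm_attr = LLMAttribution(fa, tokenizer)
+        >>> inp = TextTemplateInput(
+        ...     "{} lives in {} and works as a {}.",
+        ...     ["Dave", "Palm Coast", "lawyer"],
+        ... )
+        >>> res = llm_attr.attribute(inp, target="playing golf")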
     """
 
+    SUPPORTED_METHODS = (FeatureAblation, ShapleyValueSampling, ShapleyValues)
+    SUPPORTED_INPUTS = (TextTemplateInput, TextTokenInput)
+
     def __init__(
         self,
         attr_method: Attribution,
@@ -75,7 +80,7 @@ class created with the llm model that follows huggingface style
         """
 
         assert isinstance(
-            attr_method, SUPPORTED_METHODS
+            attr_method, self.SUPPORTED_METHODS
         ), f"LLMAttribution does not support {type(attr_method)}"
 
         super().__init__(attr_method.forward_func)
@@ -86,6 +91,7 @@ class created with the llm model that follows huggingface style
         self.attr_method.forward_func = self._forward_func
 
         # alias, we really need a model and don't support wrapper functions
+        # because we need to call model.forward, model.generate, etc.
         self.model = cast(nn.Module, self.forward_func)
 
         self.tokenizer = tokenizer
@@ -103,14 +109,12 @@ class created with the llm model that follows huggingface style
 
     def _forward_func(
         self,
-        perturbed_feature,
-        input_feature,
+        perturbed_tensor,
+        inp,
         target_tokens,
         _inspect_forward,
     ):
-        perturbed_input = self._format_model_input(
-            input_feature.to_model_input(perturbed_feature)
-        )
+        perturbed_input = self._format_model_input(inp.to_model_input(perturbed_tensor))
         init_model_inp = perturbed_input
 
         model_inp = init_model_inp
@@ -192,7 +196,7 @@ def attribute(
         """
 
         assert isinstance(
-            inp, SUPPORTED_INPUTS
+            inp, self.SUPPORTED_INPUTS
         ), f"LLMAttribution does not support input type {type(inp)}"
 
         if target is None:
@@ -214,6 +218,7 @@ def attribute(
             if type(target) is str:
                 # exclude sos
                 target_tokens = self.tokenizer.encode(target)[1:]
+                target_tokens = torch.tensor(target_tokens)
             elif type(target) is torch.Tensor:
                 target_tokens = target
 
@@ -249,3 +254,195 @@ def attribute(
             inp.values,
             self.tokenizer.convert_ids_to_tokens(target_tokens),
         )
+
+
+class LLMGradientAttribution(Attribution):
+    """
+    Attribution class for large language models. It wraps a gradient-based
+    attribution algorithm to produce the attribution results commonly of
+    interest for the text-generation use case.
+    The wrapped instance will calculate attributions in the same way as
+    configured in the original attribution algorithm, with respect to the
+    log probabilities of each generated token and of the whole sequence.
+    It provides a new "attribute" function which accepts text-based inputs
+    and returns an LLMAttributionResult
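+
+    Example (a minimal usage sketch; assumes a huggingface-style causal LM
+    ``model`` exposing its input embedding layer, plus its ``tokenizer``;
+    the prompt and target are illustrative)::
+
+        >>> lig = LayerIntegratedGradients(model, model.get_input_embeddings())
+        >>> llm_attr = LLMGradientAttribution(lig, tokenizer)
+        >>> inp = TextTokenInput("Palm Coast is a city in", tokenizer)
+        >>> res = llm_attr.attribute(inp, target="Florida")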
+    """
+
+    SUPPORTED_METHODS = (LayerIntegratedGradients,)
+    SUPPORTED_INPUTS = (TextTokenInput,)
+
+    def __init__(
+        self,
+        attr_method,
+        tokenizer,
+    ):
+        """
+        Args:
+            attr_method (Attribution): instance of a supported gradient attribution
+                    class created with the llm model that follows huggingface style
+                    interface convention
+            tokenizer (Tokenizer): tokenizer of the llm model used in the attr_method
+        """
+        assert isinstance(
+            attr_method, self.SUPPORTED_METHODS
+        ), f"LLMGradientAttribution does not support {type(attr_method)}"
+
+        super().__init__(attr_method.forward_func)
+
+        # shallow copy is enough to avoid modifying original instance
+        self.attr_method = copy(attr_method)
+        self.attr_method.forward_func = self._forward_func
+
+        # alias, we really need a model and don't support wrapper functions
+        # because we need to call model.forward, model.generate, etc.
+        self.model = cast(nn.Module, self.forward_func)
+
+        self.tokenizer = tokenizer
+        self.device = (
+            cast(torch.device, self.model.device)
+            if hasattr(self.model, "device")
+            else next(self.model.parameters()).device
+        )
+
+    def _forward_func(
+        self,
+        perturbed_tensor: Tensor,
+        inp: InterpretableInput,
+        target_tokens: Tensor,  # 1D tensor of target token ids
+        cur_target_idx: int,  # current target index
+    ):
+        perturbed_input = self._format_model_input(inp.to_model_input(perturbed_tensor))
+
+        if cur_target_idx:
+            # append the target tokens generated so far to the input, so the
+            # model predicts the current target token (teacher forcing);
+            # the input batch size can be expanded by the attr method
+            output_token_tensor = (
+                target_tokens[:cur_target_idx]
+                .unsqueeze(0)
+                .expand(perturbed_input.size(0), -1)
+                .to(self.device)
+            )
+            new_input_tensor = torch.cat([perturbed_input, output_token_tensor], dim=1)
+        else:
+            new_input_tensor = perturbed_input
+
+        output_logits = self.model(new_input_tensor)
+
+        new_token_logits = output_logits.logits[:, -1]
+        log_probs = torch.nn.functional.log_softmax(new_token_logits, dim=1)
+
+        target_token = target_tokens[cur_target_idx]
+        token_log_probs = log_probs[..., target_token]
+
+        # the attribution target is limited to the log probability
+        return token_log_probs
+
+    def _format_model_input(self, model_input):
+        """
+        Move the tokenized input tensor onto the model device
+        """
+        return model_input.to(self.device)
+
+    def attribute(
+        self,
+        inp: InterpretableInput,
+        target: Union[str, torch.Tensor, None] = None,
+        gen_args: Optional[Dict] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            inp (InterpretableInput): input prompt for which attributions are computed
+            target (str or Tensor, optional): target response with respect to
+                    which attributions are computed. If None, it uses the model
+                    to generate the target based on the input and gen_args.
+                    Default: None
+            gen_args (dict, optional): arguments for generating the target. Only used if
+                    target is not given. When None, the default arguments are used,
+                    {"max_new_tokens": 25, "do_sample": False}
+                    Default: None
+            **kwargs (Any): any extra keyword arguments passed to the call of the
+                    underlying attribute function of the given attribution instance
+
+        Returns:
+
+            attr (LLMAttributionResult): attribution result
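+
+        Example (a minimal sketch; ``llm_attr`` is an LLMGradientAttribution
+        instance and ``inp`` a TextTokenInput, both assumed to exist)::
+
+            >>> # target=None lets the wrapped model generate the target
+            >>> res = llm_attr.attribute(inp, gen_args={"max_new_tokens": 10})
+            >>> # or supply the target response explicitly
+            >>> res = llm_attr.attribute(inp, target="Florida")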
+        """
+
+        assert isinstance(
+            inp, self.SUPPORTED_INPUTS
+        ), f"LLMGradientAttribution does not support input type {type(inp)}"
+
+        if target is None:
+            # generate when None
+            assert hasattr(self.model, "generate") and callable(self.model.generate), (
+                "The model does not have a recognizable generate function."
+                " Target must be given for attribution."
+            )
+
+            if not gen_args:
+                gen_args = DEFAULT_GEN_ARGS
+
+            model_inp = self._format_model_input(inp.to_model_input())
+            output_tokens = self.model.generate(model_inp, **gen_args)
+            target_tokens = output_tokens[0][model_inp.size(1) :]
+        else:
+            assert gen_args is None, "gen_args must be None when target is given"
+
+            if type(target) is str:
+                # exclude sos
+                target_tokens = self.tokenizer.encode(target)[1:]
+                target_tokens = torch.tensor(target_tokens)
+            elif type(target) is torch.Tensor:
+                target_tokens = target
+
+        attr_inp = inp.to_tensor().to(self.device)
+
+        attr_list = []
+        for cur_target_idx, _ in enumerate(target_tokens):
+            # attr in shape(batch_size, input+output_len, emb_dim)
+            attr = self.attr_method.attribute(
+                attr_inp,
+                additional_forward_args=(
+                    inp,
+                    target_tokens,
+                    cur_target_idx,
+                ),
+                **kwargs,
+            )
+            attr = cast(Tensor, attr)
+
+            # the attr also covers the previously generated output tokens;
+            # cut it back to shape(batch_size, inp_len, emb_dim)
+            if cur_target_idx:
+                attr = attr[:, :-cur_target_idx]
+
+            # aggregate over the embedding dimension; the author of IG uses sum
+            # https://github.com/ankurtaly/Integrated-Gradients/blob/master/BertModel/bert_model_utils.py#L350
+            attr = attr.sum(-1)
+
+            attr_list.append(attr)
+
+        # assume the inp batch only has one instance
+        # to shape(n_output_token, ...)
+        attr = torch.cat(attr_list, dim=0)
+
+        # gradient attribution methods do not use the length of the features in
+        # the interpretable format; they attribute to every element of the
+        # specified layer's output, so input types that only keep a subset of
+        # the elements need special handling
+        if isinstance(inp, TextTokenInput) and inp.itp_mask is not None:
+            itp_mask = inp.itp_mask.to(self.device)
+            itp_mask = itp_mask.expand_as(attr)
+            attr = attr[itp_mask].view(attr.size(0), -1)
+
+        # for all the gradient methods we support in this class,
+        # the seq attr is the sum of all the token attrs when the
+        # attribution target is the log probability, shape(n_input_features)
+        seq_attr = attr.sum(0)
+
+        return LLMAttributionResult(
+            seq_attr,
+            attr,  # shape(n_output_token, n_input_features)
+            inp.values,
+            self.tokenizer.convert_ids_to_tokens(target_tokens),
+        )