-
Notifications
You must be signed in to change notification settings - Fork 235
feat(v2): add tensorflow support #1064
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ba1a4b6
978dfe4
712c950
fd88185
17fe3d5
b0fd980
c2e6ab0
cf6a0ef
194ab9f
6a2ecf1
056db70
6c35902
2abe113
beb340e
1817c44
a74daac
ab7d153
72744ad
dfdff10
28a7291
418ee37
217c870
fd7a8e5
26150b6
567b56a
665f408
68bdd77
fcd5b74
1e8e240
a5988ab
156a508
a29f5c1
eb6a53a
a2afa41
9cc04dd
bfffc2d
6e715b3
cc2e837
125f66c
d6506d1
73f8b0a
2d9162c
ef335ad
1e85f7a
ca8d1d1
1dd9c6e
f8d8426
269cf0f
9bcb816
dac79e2
8046e99
e325244
b7db1c8
9d1ef56
0467aca
a25dceb
52064d2
591cea1
5fc2721
7432123
1144d6f
25b9f42
20a2b3e
102e42a
b4b7e43
545438d
4905526
c2aa0b1
81a2540
838955a
09dd6a9
5daa511
6c36d25
85183ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -209,22 +209,22 @@ class MyMultiModalModel(nn.Module): | |
| self.text_encoder = TextEncoder() | ||
|
|
||
| def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2): | ||
| emnedding_text_1 = self.text_encoder(text_1) | ||
| emnedding_text_2 = self.text_encoder(text_2) | ||
| embedding_text_1 = self.text_encoder(text_1) | ||
| embedding_text_2 = self.text_encoder(text_2) | ||
|
|
||
| emnedding_image_1 = self.image_encoder(image_1) | ||
| emnedding_image_2 = self.image_encoder(image_2) | ||
| embedding_image_1 = self.image_encoder(image_1) | ||
| embedding_image_2 = self.image_encoder(image_2) | ||
|
|
||
| emnedding_audio_1 = self.image_encoder(audio_1) | ||
| emnedding_audio_2 = self.image_encoder(audio_2) | ||
| embedding_audio_1 = self.image_encoder(audio_1) | ||
| embedding_audio_2 = self.image_encoder(audio_2) | ||
|
|
||
| return ( | ||
| emnedding_text_1, | ||
| emnedding_text_2, | ||
| emnedding_image_1, | ||
| emnedding_image_2, | ||
| emnedding_audio_1, | ||
| emnedding_audio_2, | ||
| embedding_text_1, | ||
| embedding_text_2, | ||
| embedding_image_1, | ||
| embedding_image_2, | ||
| embedding_audio_1, | ||
| embedding_audio_2, | ||
| ) | ||
| ``` | ||
|
|
||
|
|
@@ -258,14 +258,14 @@ class MyPodcastModel(nn.Module): | |
| self.image_encoder = ImageEncoder() | ||
| self.text_encoder = TextEncoder() | ||
|
|
||
| def forward_podcast(da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: | ||
| def forward_podcast(self, da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: | ||
| da.audio.embedding = self.audio_encoder(da.audio.tensor) | ||
| da.text.embedding = self.text_encoder(da.text.tensor) | ||
| da.image.embedding = self.image_encoder(da.image.tensor) | ||
|
|
||
| return da | ||
|
|
||
| def forward(da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: | ||
| def forward(self, da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: | ||
| da.left = self.forward_podcast(da.left) | ||
| da.right = self.forward_podcast(da.right) | ||
|
|
||
|
|
@@ -277,6 +277,49 @@ You instantly win in code readability and maintainability. And for the same pric | |
| schema definition (see below). Everything handles in a pythonic manner by relying on type hints. | ||
|
|
||
|
|
||
| ## Coming from TensorFlow | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this part is too big IMO. We just need to show that there is a (tiny ?) difference and that you need to access tensor.tensor. No need to show the full example |
||
|
|
||
| Similar to the PyTorch approach, you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. | ||
|
|
||
| First off, to use DocArray with TensorFlow we first need to install it as follows: | ||
| ``` | ||
| pip install tensorflow==2.11.0 | ||
| pip install protobuf==3.19.0 | ||
| ``` | ||
|
|
||
| Compared to using DocArray with PyTorch, there is one main difference when using it with TensorFlow:\ | ||
| While DocArray's `TorchTensor` is a subclass of `torch.Tensor`, this is not the case for the `TensorFlowTensor`: due to technical limitations on `tf.Tensor`, DocArray's `TensorFlowTensor` is not a subclass of `tf.Tensor` but instead stores a `tf.Tensor` in its `.tensor` attribute. | ||
|
|
||
| How does this affect you? Whenever you want to access the tensor data to e.g. do operations with it or hand it to your ML model, instead of handing over your `TensorFlowTensor` instance, you need to access its `.tensor` attribute. | ||
|
|
||
| This would look like the following: | ||
|
|
||
| ```python | ||
| from typing import Optional | ||
|
|
||
| from docarray import DocumentArray, BaseDocument | ||
|
|
||
| import tensorflow as tf | ||
|
|
||
|
|
||
| class Podcast(BaseDocument): | ||
| audio_tensor: Optional[AudioTensorFlowTensor] | ||
| embedding: Optional[AudioTensorFlowTensor] | ||
|
|
||
|
|
||
| class MyPodcastModel(tf.keras.Model): | ||
| def __init__(self): | ||
| super().__init__() | ||
| self.audio_encoder = AudioEncoder() | ||
|
|
||
| def call(self, inputs: DocumentArray[Podcast]) -> DocumentArray[Podcast]: | ||
| inputs.audio_tensor.embedding = self.audio_encoder( | ||
| inputs.audio_tensor.tensor | ||
| ) # access audio_tensor's .tensor attribute | ||
| return inputs | ||
| ``` | ||
|
|
||
|
|
||
|
|
||
| ## Coming from FastAPI | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,14 +27,22 @@ | |
| from pydantic.fields import ModelField | ||
|
|
||
| from docarray.proto import DocumentArrayStackedProto | ||
| from docarray.typing import TorchTensor | ||
| from docarray.typing.tensor.abstract_tensor import AbstractTensor | ||
|
|
||
| try: | ||
| from docarray.typing import TorchTensor | ||
| except ImportError: | ||
| TorchTensor = None # type: ignore | ||
|
|
||
| try: | ||
| import tensorflow as tf # type: ignore | ||
|
|
||
| from docarray.typing import TensorFlowTensor | ||
|
Comment on lines
+36
to
+39
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for torch i moved this thing to a helper in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, saw that and started doing this in the TF embedding/video/audio PR, so i'll do this refactor there if that's fine with you @JohannesMessner |
||
|
|
||
| tf_available = True | ||
| except (ImportError, TypeError): | ||
| TensorFlowTensor = None # type: ignore | ||
| tf_available = False | ||
|
|
||
| T = TypeVar('T', bound='DocumentArrayStacked') | ||
| IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] | ||
|
|
||
|
|
@@ -163,7 +171,26 @@ def _create_columns( | |
| tensor_columns: Dict[str, AbstractTensor] = dict() | ||
|
|
||
| for field, type_ in column_schema.items(): | ||
| if issubclass(type_, AbstractTensor): | ||
| if tf_available and isinstance(getattr(docs[0], field), TensorFlowTensor): | ||
| # tf.Tensor does not allow item assignment, therefore the optimized way | ||
| # of initializing an empty array and assigning values to it iteratively | ||
| # does not work here, therefore handle separately. | ||
| tf_stack = [] | ||
anna-charlotte marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| for i, doc in enumerate(docs): | ||
| val = getattr(doc, field) | ||
| if val is None: | ||
| val = tensor_type.get_comp_backend().none_value() | ||
| tf_stack.append(val.tensor) | ||
| del val.tensor | ||
|
|
||
| stacked: tf.Tensor = tf.stack(tf_stack) | ||
| tensor_columns[field] = TensorFlowTensor(stacked) | ||
| for i, doc in enumerate(docs): | ||
| val = getattr(doc, field) | ||
| x = tensor_columns[field][i].tensor | ||
| val.tensor = x | ||
|
|
||
| elif issubclass(type_, AbstractTensor): | ||
| tensor = getattr(docs[0], field) | ||
| column_shape = ( | ||
| (len(docs), *tensor.shape) if tensor is not None else (len(docs),) | ||
|
|
@@ -190,7 +217,8 @@ def _create_columns( | |
| # We thus chose to convert the individual rank 0 tensors to rank 1 | ||
| # This does mean that stacking rank 0 tensors will transform them | ||
| # to rank 1 | ||
| if tensor_columns[field].ndim == 1: | ||
| tensor = tensor_columns[field] | ||
| if tensor.get_comp_backend().n_dim(tensor) == 1: | ||
| setattr(doc, field, tensor_columns[field][i : i + 1]) | ||
| else: | ||
| setattr(doc, field, tensor_columns[field][i]) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.