Module deepposekit.models.StackedDenseNet
Expand source code
# -*- coding: utf-8 -*-
# Copyright 2018-2019 Jacob M. Graving <jgraving@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import BatchNormalization
import deepposekit.utils.image as image_utils
from deepposekit.models.engine import BaseModel
from deepposekit.models.layers.util import ImageNormalization
from deepposekit.models.layers.deeplabcut import ImageNetPreprocess
from deepposekit.models.layers.densenet import (
FrontEnd,
ImageNetFrontEnd,
DenseNet,
OutputChannels,
Concatenate,
)
class StackedDenseNet(BaseModel):
def __init__(
self,
train_generator,
n_stacks=1,
n_transitions=-1,
growth_rate=48,
bottleneck_factor=1,
compression_factor=0.5,
pretrained=False,
subpixel=True,
**kwargs
):
"""
Define a Stacked DenseNet model from Graving et al. [1]
for pose estimation.
This model combines elements from [2-5]
See `References` for details on the model architecture.
Parameters
----------
train_generator : class deepposekit.io.TrainingGenerator
A deepposekit.io.TrainingGenerator class for generating
images and confidence maps.
n_stacks : int, default = 1
The number of encoder-decoder networks to stack
with intermediate supervision between stacks
        n_transitions : int, default = -1
            The number of transition layers (downsampling and upsampling)
            in each encoder-decoder stack. If the value is <0, the number
            of transitions is set automatically based on image size,
            counting back from the maximum number of possible transitions:
            n_transitions = max_transitions - |n_transitions + 1|.
            The default is -1, which uses the maximum number of
            transitions possible.
growth_rate : int, default = 48
The number of channels to output from each convolutional
block.
        bottleneck_factor : int, default = 1
            The factor for determining the number of input channels
            to the 3x3 convolutional layer in each convolutional block.
Inputs are first passed through a 1x1 convolutional layer to
reduce the number of channels to:
growth_rate * bottleneck_factor
        compression_factor : float, default = 0.5
The factor for determining the number of channels passed
through a transition layer (downsampling or upsampling).
Inputs are first passed through a 1x1 convolutional layer
to reduce the number of channels to
n_input_channels * compression_factor
pretrained : bool, default = False
Whether to use an encoder that is pretrained on ImageNet
        subpixel : bool, default = True
            Whether to use subpixel maxima for calculating
            keypoint coordinates in the prediction model.
        Attributes
        ----------
train_model: keras.Model
A model for training the network to produce confidence maps with
one input layer for images and `n_outputs` output layers for training
with intermediate supervision
        predict_model: keras.Model
            A model for predicting keypoint coordinates with one input and one output,
            using Maxima2D or SubpixelMaxima2D layers at the output of the network.
            Both of these models share the same computational graph, so training train_model
            updates the weights of predict_model.
References
----------
1. Graving, J.M., Chae, D., Naik, H., Li, L., Koger, B., Costelloe, B.R.,
Couzin, I.D. (2019) DeepPoseKit, a software toolkit for fast and robust
animal pose estimation using deep learning. eLife, 8, e47994
2. Jégou, S., Drozdzal, M., Vazquez, D., Romero, A., & Bengio, Y. (2017).
The one hundred layers tiramisu: Fully convolutional densenets for
semantic segmentation. In Computer Vision and Pattern Recognition
Workshops (CVPRW), 2017 IEEE Conference on (pp. 1175-1183). IEEE.
3. Newell, A., Yang, K., & Deng, J. (2016). Stacked hourglass networks
for human pose estimation. In European Conference on Computer
Vision (pp. 483-499). Springer, Cham.
4. Huang, G., Liu, Z., Weinberger, K. Q., & van der Maaten, L. (2017).
Densely connected convolutional networks. In Proceedings of the IEEE
conference on computer vision and pattern recognition
(Vol. 1, No. 2, p. 3).
5. Klambauer, G., Unterthiner, T., Mayr, A., & Hochreiter, S. (2017).
Self-normalizing neural networks. In Advances in Neural Information
Processing Systems (pp. 972-981).
"""
self.n_stacks = n_stacks
self.growth_rate = growth_rate
self.bottleneck_factor = bottleneck_factor
self.compression_factor = compression_factor
self.n_transitions = n_transitions
self.pretrained = pretrained
super(StackedDenseNet, self).__init__(train_generator, subpixel, **kwargs)
def __init_model__(self):
max_transitions = np.min(
[
image_utils.n_downsample(self.train_generator.height),
image_utils.n_downsample(self.train_generator.width),
]
)
n_transitions = self.n_transitions
        if isinstance(n_transitions, (int, np.integer)):
            if n_transitions == 0:
                raise ValueError("n_transitions cannot equal zero")
            if n_transitions < 0:
                # Negative values count back from the maximum:
                # n_transitions = max_transitions - |n_transitions + 1|
                n_transitions += 1
                n_transitions = max_transitions - np.abs(n_transitions)
                self.n_transitions = n_transitions
elif 0 < n_transitions <= max_transitions:
self.n_transitions = n_transitions
else:
raise ValueError(
"n_transitions must be in range {0} "
"< n_transitions <= "
"{1}".format(-max_transitions + 1, max_transitions)
)
else:
raise TypeError(
"n_transitions must be integer in range "
"{0} < n_transitions <= "
"{1}".format(-max_transitions + 1, max_transitions)
)
        if self.train_generator.downsample_factor < 2:
            raise ValueError(
                "StackedDenseNet is only compatible with `downsample_factor` >= 2. "
                "Adjust the TrainingGenerator or choose a different model."
            )
if n_transitions <= self.train_generator.downsample_factor:
raise ValueError(
"`n_transitions` <= `downsample_factor`. Increase `n_transitions` or decrease `downsample_factor`."
" If `n_transitions` is -1 (the default), check that your image resolutions can be repeatedly downsampled (are divisible by 2 repeatedly)."
)
        if self.pretrained:
            if self.input_shape[-1] == 1:
                # ImageNet weights expect 3-channel inputs,
                # so replicate grayscale images across channels
                inputs = Concatenate()([self.inputs] * 3)
                input_shape = self.input_shape[:-1] + (3,)
else:
inputs = self.inputs
input_shape = self.input_shape
normalized = ImageNetPreprocess("densenet121")(inputs)
front_outputs = ImageNetFrontEnd(
input_shape=input_shape,
n_downsample=self.train_generator.downsample_factor,
compression_factor=self.compression_factor,
)(normalized)
else:
normalized = ImageNormalization()(self.inputs)
front_outputs = FrontEnd(
growth_rate=self.growth_rate,
n_downsample=self.train_generator.downsample_factor,
compression_factor=self.compression_factor,
bottleneck_factor=self.bottleneck_factor,
)(normalized)
        # First prediction from the front-end features,
        # supervised as "output_0"
        outputs = front_outputs
        model_outputs = OutputChannels(
            self.train_generator.n_output_channels, name="output_0"
        )(outputs)
        model_outputs_list = [model_outputs]
        outputs.append(BatchNormalization()(model_outputs))
        for idx in range(self.n_stacks):
            # Each stack is an encoder-decoder DenseNet that refines
            # the features from the previous stage
            outputs = DenseNet(
                growth_rate=self.growth_rate,
                n_downsample=self.n_transitions
                - self.train_generator.downsample_factor,
                downsample_factor=self.train_generator.downsample_factor,
                compression_factor=self.compression_factor,
                bottleneck_factor=self.bottleneck_factor,
            )(outputs)
            # Skip connections: reuse the front-end features and the
            # previous stack's predictions as inputs to the next stage
            outputs.append(Concatenate()(front_outputs))
            outputs.append(BatchNormalization()(model_outputs))
model_outputs = OutputChannels(
self.train_generator.n_output_channels, name="output_" + str(idx + 1)
)(outputs)
model_outputs_list.append(model_outputs)
self.train_model = Model(
self.inputs, model_outputs_list, name=self.__class__.__name__
)
def get_config(self):
config = {
"name": self.__class__.__name__,
"n_stacks": self.n_stacks,
"n_transitions": self.n_transitions,
"growth_rate": self.growth_rate,
"bottleneck_factor": self.bottleneck_factor,
"compression_factor": self.compression_factor,
"pretrained": self.pretrained,
"subpixel": self.subpixel,
}
base_config = super(StackedDenseNet, self).get_config()
return dict(list(config.items()) + list(base_config.items()))
Classes
class StackedDenseNet (train_generator, n_stacks=1, n_transitions=-1, growth_rate=48, bottleneck_factor=1, compression_factor=0.5, pretrained=False, subpixel=True, **kwargs)
Define a Stacked DenseNet model from Graving et al. [1] for pose estimation.
This model combines elements from [2-5]. See References for details on the
model architecture.

Parameters
train_generator : deepposekit.io.TrainingGenerator
    A deepposekit.io.TrainingGenerator class for generating
    images and confidence maps.
n_stacks : int, default = 1
    The number of encoder-decoder networks to stack
    with intermediate supervision between stacks.
n_transitions : int, default = -1
    The number of transition layers (downsampling and upsampling)
    in each encoder-decoder stack. If the value is <0, the number of
    transitions is set automatically based on image size, counting back
    from the maximum number of possible transitions:
    n_transitions = max_transitions - |n_transitions + 1|
    (see the sketch after this list). The default is -1, which uses the
    maximum number of transitions possible.
growth_rate : int, default = 48
    The number of channels to output from each convolutional block.
bottleneck_factor : int, default = 1
    The factor for determining the number of input channels to the 3x3
    convolutional layer in each convolutional block. Inputs are first
    passed through a 1x1 convolutional layer to reduce the number of
    channels to: growth_rate * bottleneck_factor
compression_factor : float, default = 0.5
    The factor for determining the number of channels passed through a
    transition layer (downsampling or upsampling). Inputs are first
    passed through a 1x1 convolutional layer to reduce the number of
    channels to: n_input_channels * compression_factor
pretrained : bool, default = False
    Whether to use an encoder that is pretrained on ImageNet.
subpixel : bool, default = True
    Whether to use subpixel maxima for calculating keypoint coordinates
    in the prediction model.
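For example, here is a minimal sketch of how a negative n_transitions resolves, assuming deepposekit.utils.image.n_downsample counts how many times an image dimension can be halved:

import numpy as np

def n_downsample(size):
    # Assumed behavior: number of times `size` can be divided by 2
    n = 0
    while size % 2 == 0 and size > 2:
        size //= 2
        n += 1
    return n

def resolve_n_transitions(n_transitions, height, width):
    max_transitions = np.min([n_downsample(height), n_downsample(width)])
    if n_transitions < 0:
        # Same arithmetic as StackedDenseNet.__init_model__
        return max_transitions - abs(n_transitions + 1)
    return n_transitions

print(resolve_n_transitions(-1, 192, 192))  # 6, since 192 = 2**6 * 3
print(resolve_n_transitions(-2, 192, 192))  # 5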
Attributes
train_model : keras.Model
    A model for training the network to produce confidence maps, with one
    input layer for images and n_outputs output layers for training with
    intermediate supervision.
predict_model : keras.Model
    A model for predicting keypoint coordinates with one input and one
    output, using Maxima2D or SubpixelMaxima2D layers at the output of
    the network.
Both of these models share the same computational graph, so training
train_model updates the weights of predict_model.
References
1. Graving, J.M., Chae, D., Naik, H., Li, L., Koger, B., Costelloe, B.R., Couzin, I.D. (2019) DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning. eLife, 8, e47994
2. Jégou, S., Drozdzal, M., Vazquez, D., Romero, A., & Bengio, Y. (2017). The one hundred layers tiramisu: Fully convolutional densenets for semantic segmentation. In Computer Vision and Pattern Recognition Workshops (CVPRW), 2017 IEEE Conference on (pp. 1175-1183). IEEE.
3. Newell, A., Yang, K., & Deng, J. (2016). Stacked hourglass networks for human pose estimation. In European Conference on Computer Vision (pp. 483-499). Springer, Cham.
4. Huang, G., Liu, Z., Weinberger, K. Q., & van der Maaten, L. (2017). Densely connected convolutional networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (Vol. 1, No. 2, p. 3).
5. Klambauer, G., Unterthiner, T., Mayr, A., & Hochreiter, S. (2017). Self-normalizing neural networks. In Advances in Neural Information Processing Systems (pp. 972-981).
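A minimal usage sketch follows. The annotation file path, generator arguments, and batch sizes are illustrative assumptions (argument names follow the DeepPoseKit examples), not prescribed by this module:

from deepposekit.io import DataGenerator, TrainingGenerator
from deepposekit.models import StackedDenseNet

# Hypothetical annotation set; replace with your own data
data_generator = DataGenerator("annotations.h5")
train_generator = TrainingGenerator(data_generator, downsample_factor=2)

model = StackedDenseNet(train_generator, n_stacks=2, growth_rate=48, subpixel=True)
model.fit(batch_size=16, n_workers=8, epochs=100)

# predict_model shares weights with train_model, so it can be used
# immediately after training.
# images: array of shape (n_images, height, width, channels)
keypoints = model.predict_model.predict(images, batch_size=16)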
Ancestors
- deepposekit.models.engine.BaseModel
Methods
def get_config(self)
Returns the model configuration as a dictionary, combining this model's
hyperparameters with the base model configuration.
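For illustration, a sketch of reading the returned configuration (keys taken from the method above; values depend on the model instance, and base-model keys will also be present):

model = StackedDenseNet(train_generator, n_stacks=2)
config = model.get_config()
config["name"]         # "StackedDenseNet"
config["n_stacks"]     # 2
config["growth_rate"]  # 48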
Inherited members