caffemodel2pytorch
This converter can be useful for porting Caffe code and layers to PyTorch. Features:
- dump caffemodel weights to hdf5, npy, pt and json formats
- load Caffe models and use them from PyTorch
- mock the PyCaffe API (`Net`, `Blob`, `SGDSolver`) to allow for smooth porting of Caffe-using code (a drop-in script for OICR that changes the backend in train/eval to PyTorch is below)
- wrapping Caffe's Python layers (see the OICR example)
- example of ROI pooling in PyTorch without manual CUDA code compilation (see the OICR example)
The layer support isn't as complete as in https://github.com/marvis/pytorch-caffe. Currently it supports the following Caffe layers:
- convolution (num_output, kernel_size, stride, pad, dilation; constant and gaussian weight/bias fillers)
- inner_product (num_output; constant and gaussian weight/bias fillers)
- max / avg pooling (kernel_size, stride, pad)
- relu
- dropout (dropout_ratio)
- eltwise (prod, sum, max)
- softmax (axis)
- local response norm (local_size, alpha, beta)
Dependencies: protobuf with Python bindings, including the `protoc` binary in PATH.
PRs to enable other layers or layer params are very welcome (see the definition of the `modules` dictionary in the code)!
License is MIT.
# prototxt and caffemodel from https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md

# dumps to PT by default to VGG_ILSVRC_16_layers.caffemodel.pt
python -m caffemodel2pytorch VGG_ILSVRC_16_layers.caffemodel

# dumps to HDF5 converted.h5
python -m caffemodel2pytorch VGG_ILSVRC_16_layers.caffemodel -o converted.h5
# load dumped VGG16 in PyTorch
import collections

import h5py
import numpy
import torch
import torchvision

model = torchvision.models.vgg16()

# Rename the positional sub-modules after the Caffe layer names so that the
# dumped blob names can be matched against PyTorch parameter names below.
model.features = torch.nn.Sequential(collections.OrderedDict(zip(
    ['conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1',
     'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
     'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'pool3',
     'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'pool4',
     'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'pool5'],
    model.features)))
model.classifier = torch.nn.Sequential(collections.OrderedDict(zip(
    ['fc6', 'relu6', 'drop6', 'fc7', 'relu7', 'drop7', 'fc8'],
    model.classifier)))

# alternatively: state_dict = torch.load('VGG_ILSVRC_16_layers.caffemodel.pt')
# The HDF5 file is opened in a `with` block so the handle is released once the
# weights have been copied into the model.
with h5py.File('converted.h5', 'r') as state_dict:
    # A dumped entry `k` is assigned to every parameter whose name contains `k`;
    # `view_as` reshapes the dumped array to the parameter's shape.
    model.load_state_dict({
        l: torch.from_numpy(numpy.array(v)).view_as(p)
        for k, v in state_dict.items()
        for l, p in model.named_parameters()
        if k in l
    })
import torch
import caffemodel2pytorch

model = caffemodel2pytorch.Net(
    prototxt = 'VGG_ILSVRC_16_layers_deploy.prototxt',
    weights = 'VGG_ILSVRC_16_layers.caffemodel',
    caffe_proto = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto'
)
model.cuda()
model.eval()
torch.set_grad_enabled(False)

# make sure to have right procedure of image normalization and channel reordering
image = torch.autograd.Variable(torch.Tensor(8, 3, 224, 224).cuda())

# outputs dict of PyTorch Variables
# in this example the dict contains the only key "prob"
#output_dict = model(data = image)

# you can remove unneeded layers:
del model.prob
del model.fc8

# a single input variable is interpreted as an input blob named "data"
# in this example the dict contains the only key "fc7"
output_dict = model(image)
import numpy as np
import caffemodel2pytorch as caffe

caffe.set_mode_gpu()
caffe.set_device(0)

# === LOADING AND USING THE NET IN EVAL MODE ===

net = caffe.Net('VGG_ILSVRC_16_layers_deploy.prototxt', caffe.TEST, weights = 'VGG_ILSVRC_16_layers.caffemodel')

# outputs a dict of NumPy arrays, data layer is sidestepped
blobs_out = net.forward(data = np.zeros((8, 3, 224, 224), dtype = np.float32))

# access the last layer
layer = net.layers[-1]

# converts and provides the output as NumPy array
numpy_array = net.blobs['conv1_1'].data

# access the loss weights
loss_weights = net.blob_loss_weights

# === BASIC OPTIMIZER ===

# this example uses paths from https://github.com/ppengtang/oicr

# create an SGD solver, loads the net in train mode
# it knows about base_lr, weight_decay, momentum, lr_mult, decay_mult, iter_size, lr policy step, step_size, gamma
# it finds train.prototxt from the solver.prototxt's train_net or net parameters
solver = caffe.SGDSolver('oicr/models/VGG16/solver.prototxt')

# load pretrained weights
solver.net.copy_from('oicr/data/imagenet_models/VGG16.v2.caffemodel')

# runs one iteration of forward, backward, optimization; returns a float loss value
# data layer must be registered or inputs must be provided as keyword arguments
loss = solver.step(1)
Place `caffe_pytorch_oicr.py` and `caffemodel2pytorch.py` in the root `oicr` directory. To use the PyTorch backend in testing and in training, put the line `import caffe_pytorch_oicr` at the very top (before `import _init_paths`) of `tools/test_net.py` and `tools/train_net.py` respectively. It requires PyTorch and CuPy (for on-the-fly CUDA kernel compilation).
# caffe_pytorch_oicr.py

import collections
import torch
import torch.nn.functional as F
import cupy
import caffemodel2pytorch

caffemodel2pytorch.initialize('./caffe-oicr/src/caffe/proto/caffe.proto') # needs to be called explicitly for these porting scenarios to enable caffe.proto.caffe_pb2 variable
caffemodel2pytorch.set_mode_gpu()

# Each entry maps a Caffe layer type name to a factory that takes the layer
# params and returns a callable. The lambdas resolve OICRLayer,
# WeightedSoftmaxWithLoss and RoiPooling lazily, so those may be defined below.
caffemodel2pytorch.modules['GlobalSumPooling'] = lambda param: lambda pred: pred.sum(dim = 0, keepdim = True)
caffemodel2pytorch.modules['MulticlassCrossEntropyLoss'] = lambda param: lambda pred, labels, eps = 1e-6: F.binary_cross_entropy(pred.clamp(eps, 1 - eps), labels)
caffemodel2pytorch.modules['data'] = lambda param: __import__('roi_data_layer.layer').layer.RoIDataLayer() # wrapping a PyCaffe layer
caffemodel2pytorch.modules['OICRLayer'] = lambda param: OICRLayer # wrapping a PyTorch function
caffemodel2pytorch.modules['WeightedSoftmaxWithLoss'] = lambda param: WeightedSoftmaxWithLoss
caffemodel2pytorch.modules['ReLU'] = lambda param: torch.nn.ReLU(inplace = True) # wrapping a PyTorch module
caffemodel2pytorch.modules['ROIPooling'] = lambda param: lambda input, rois: RoiPooling(param['pooled_h'], param['pooled_w'], param['spatial_scale'])(input, rois) # wrapping a PyTorch autograd function


def WeightedSoftmaxWithLoss(prob, labels_ic, cls_loss_weights, eps = 1e-12):
    """Per-sample weighted negative log-softmax loss.

    Applies ``log_softmax`` over the last dimension of ``prob``, picks the
    entry indexed by ``labels_ic`` for every sample and scales its negative by
    ``cls_loss_weights``. The summed loss is divided by the number of weights
    greater than ``eps``, or by the number of loss elements when no weight
    exceeds ``eps``.
    """
    loss = -cls_loss_weights * F.log_softmax(prob, dim = -1).gather(-1, labels_ic.long().unsqueeze(-1)).squeeze(-1)
    valid_sum = cls_loss_weights.gt(eps).float().sum()
    return loss.sum() / (loss.numel() if valid_sum == 0 else valid_sum)


def OICRLayer(boxes, cls_prob, im_labels, cfg_TRAIN_FG_THRESH = 0.5):
    """Derive per-box pseudo labels and weights from image-level labels.

    For every class whose entry in ``im_labels`` equals 1, the currently
    highest scoring box is taken as a pseudo ground-truth box (its score row
    is zeroed afterwards so it is not picked again). Every box is then
    assigned to the pseudo ground-truth box it overlaps most.

    Returns a pair ``(labels, weights)``: ``labels`` is the assigned class
    index (offset by one) or 0 when the best overlap does not exceed
    ``cfg_TRAIN_FG_THRESH``; ``weights`` is the score of the assigned pseudo
    ground-truth box.
    """
    # Drop the leading score column when cls_prob has one more column than im_labels;
    # clone because rows are zeroed in place below.
    cls_prob = (cls_prob if cls_prob.size(-1) == im_labels.size(-1) else cls_prob[..., 1:]).clone()
    # The first column of `boxes` is skipped; the remaining four are used as coordinates.
    boxes = boxes[..., 1:]
    gt_boxes, gt_classes, gt_scores = [], [], []
    for i in im_labels.eq(1).nonzero()[:, 1]:
        max_index = int(cls_prob[:, i].max(dim = 0)[1])
        gt_boxes.append(boxes[max_index])
        gt_classes.append(int(i) + 1)
        gt_scores.append(float(cls_prob[max_index, i]))
        cls_prob[max_index] = 0
    max_overlaps, gt_assignment = overlap(boxes, torch.stack(gt_boxes)).max(dim = 1)
    return gt_assignment.new(gt_classes)[gt_assignment] * (max_overlaps > cfg_TRAIN_FG_THRESH).type_as(gt_assignment), max_overlaps.new(gt_scores)[gt_assignment]


class RoiPooling(torch.autograd.Function):
    """ROI max pooling backed by CUDA kernels compiled at import time via CuPy.

    Instances are configured with the pooled output size and the spatial
    scale and are then called as ``RoiPooling(ph, pw, scale)(images, rois)``.

    NOTE(review): this is written in the legacy autograd ``Function`` style
    (state kept on the instance, non-static ``forward``/``backward``) and uses
    ``cupy.cuda.compiler.compile_with_cache``; confirm that the installed
    PyTorch and CuPy versions still support both.
    """

    CUDA_NUM_THREADS = 1024
    # Number of blocks needed to cover N elements with CUDA_NUM_THREADS threads each.
    GET_BLOCKS = staticmethod(lambda N: (N + RoiPooling.CUDA_NUM_THREADS - 1) // RoiPooling.CUDA_NUM_THREADS)
    # Minimal object exposing `.ptr`, used to hand the current PyTorch stream to the kernel launch.
    Stream = collections.namedtuple('Stream', ['ptr'])

    kernel_forward = b'''
    #define FLT_MAX 340282346638528859811704183484516925440.0f
    typedef float Dtype;
    #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

    extern "C"
    __global__ void ROIPoolForward(const int nthreads, const Dtype* bottom_data,
        const Dtype spatial_scale, const int channels, const int height,
        const int width, const int pooled_height, const int pooled_width,
        const Dtype* bottom_rois, Dtype* top_data, int* argmax_data) {
      CUDA_KERNEL_LOOP(index, nthreads) {
        // (n, c, ph, pw) is an element in the pooled output
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int c = (index / pooled_width / pooled_height) % channels;
        int n = index / pooled_width / pooled_height / channels;

        // Use per-iteration offset pointers instead of advancing the kernel
        // arguments, so the offsets do not accumulate if a thread processes
        // more than one index in this loop.
        const Dtype* offset_bottom_rois = bottom_rois + n * 5;
        int roi_batch_ind = offset_bottom_rois[0];
        int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
        int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
        int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
        int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

        // Force malformed ROIs to be 1x1
        int roi_width = max(roi_end_w - roi_start_w + 1, 1);
        int roi_height = max(roi_end_h - roi_start_h + 1, 1);
        Dtype bin_size_h = static_cast<Dtype>(roi_height)
                           / static_cast<Dtype>(pooled_height);
        Dtype bin_size_w = static_cast<Dtype>(roi_width)
                           / static_cast<Dtype>(pooled_width);

        int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                            * bin_size_h));
        int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                            * bin_size_w));
        int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                         * bin_size_h));
        int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                         * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = min(max(hstart + roi_start_h, 0), height);
        hend = min(max(hend + roi_start_h, 0), height);
        wstart = min(max(wstart + roi_start_w, 0), width);
        wend = min(max(wend + roi_start_w, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Define an empty pooling region to be zero
        Dtype maxval = is_empty ? 0 : -FLT_MAX;
        // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
        int maxidx = -1;
        const Dtype* offset_bottom_data =
            bottom_data + (roi_batch_ind * channels + c) * height * width;
        for (int h = hstart; h < hend; ++h) {
          for (int w = wstart; w < wend; ++w) {
            int bottom_index = h * width + w;
            if (offset_bottom_data[bottom_index] > maxval) {
              maxval = offset_bottom_data[bottom_index];
              maxidx = bottom_index;
            }
          }
        }
        top_data[index] = maxval;
        argmax_data[index] = maxidx;
      }
    }
    '''

    kernel_backward = b'''
    typedef float Dtype;
    #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

    extern "C"
    __global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff,
        const int* argmax_data, const int num_rois, const Dtype spatial_scale,
        const int channels, const int height, const int width,
        const int pooled_height, const int pooled_width, Dtype* bottom_diff,
        const Dtype* bottom_rois) {
      CUDA_KERNEL_LOOP(index, nthreads) {
        // (n, c, h, w) coords in bottom data
        int w = index % width;
        int h = (index / width) % height;
        int c = (index / width / height) % channels;
        int n = index / width / height / channels;

        Dtype gradient = 0;
        // Accumulate gradient over all ROIs that pooled this element
        for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
          const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
          int roi_batch_ind = offset_bottom_rois[0];
          // Skip if ROI's batch index doesn't match n
          if (n != roi_batch_ind) {
            continue;
          }

          int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
          int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
          int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
          int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);

          // Skip if ROI doesn't include (h, w)
          const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                               h >= roi_start_h && h <= roi_end_h);
          if (!in_roi) {
            continue;
          }

          int offset = (roi_n * channels + c) * pooled_height * pooled_width;
          const Dtype* offset_top_diff = top_diff + offset;
          const int* offset_argmax_data = argmax_data + offset;

          // Compute feasible set of pooled units that could have pooled
          // this bottom unit

          // Force malformed ROIs to be 1x1
          int roi_width = max(roi_end_w - roi_start_w + 1, 1);
          int roi_height = max(roi_end_h - roi_start_h + 1, 1);

          Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height);
          Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width);

          int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
          int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
          int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
          int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);

          phstart = min(max(phstart, 0), pooled_height);
          phend = min(max(phend, 0), pooled_height);
          pwstart = min(max(pwstart, 0), pooled_width);
          pwend = min(max(pwend, 0), pooled_width);

          for (int ph = phstart; ph < phend; ++ph) {
            for (int pw = pwstart; pw < pwend; ++pw) {
              if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
                gradient += offset_top_diff[ph * pooled_width + pw];
              }
            }
          }
        }
        bottom_diff[index] = gradient;
      }
    }
    '''

    # Created before compiling the kernels; presumably forces CuPy to initialise CUDA - TODO confirm.
    cupy_init = cupy.array([])
    compiled_forward = cupy.cuda.compiler.compile_with_cache(kernel_forward).get_function('ROIPoolForward')
    compiled_backward = cupy.cuda.compiler.compile_with_cache(kernel_backward).get_function('ROIPoolBackward')

    def __init__(self, pooled_height, pooled_width, spatial_scale):
        """Store the pooled output size and the ROI coordinate scale factor."""
        self.pooled_height = pooled_height
        self.pooled_width = pooled_width
        self.spatial_scale = spatial_scale

    def forward(self, images, rois):
        """Max-pool every ROI of ``images`` into a ``pooled_height x pooled_width`` grid.

        ``rois`` holds five floats per ROI, read by the kernel as batch index
        followed by start-w, start-h, end-w, end-h (multiplied by
        ``spatial_scale`` and rounded). The result has one row per ROI with
        ``channels * pooled_height * pooled_width`` columns. The argmax
        indices and the input size are kept on ``self`` for ``backward``.
        """
        output = torch.cuda.FloatTensor(len(rois), images.size(1) * self.pooled_height * self.pooled_width)
        self.argmax = torch.cuda.IntTensor(output.size()).fill_(-1)
        self.input_size = images.size()
        self.save_for_backward(rois)
        RoiPooling.compiled_forward(
            grid = (RoiPooling.GET_BLOCKS(output.numel()), 1, 1),
            block = (RoiPooling.CUDA_NUM_THREADS, 1, 1),
            args = [
                output.numel(), images.data_ptr(), cupy.float32(self.spatial_scale),
                self.input_size[-3], self.input_size[-2], self.input_size[-1],
                self.pooled_height, self.pooled_width,
                rois.data_ptr(), output.data_ptr(), self.argmax.data_ptr()
            ],
            stream = RoiPooling.Stream(ptr = torch.cuda.current_stream().cuda_stream))
        return output

    def backward(self, grad_output):
        """Route ``grad_output`` back to the input positions recorded as argmax.

        Returns the gradient w.r.t. ``images`` and ``None`` for ``rois``.
        """
        rois, = self.saved_tensors
        grad_input = torch.cuda.FloatTensor(*self.input_size).zero_()
        RoiPooling.compiled_backward(
            grid = (RoiPooling.GET_BLOCKS(grad_input.numel()), 1, 1),
            block = (RoiPooling.CUDA_NUM_THREADS, 1, 1),
            args = [
                grad_input.numel(), grad_output.data_ptr(), self.argmax.data_ptr(), len(rois),
                cupy.float32(self.spatial_scale),
                self.input_size[-3], self.input_size[-2], self.input_size[-1],
                self.pooled_height, self.pooled_width,
                grad_input.data_ptr(), rois.data_ptr()
            ],
            stream = RoiPooling.Stream(ptr = torch.cuda.current_stream().cuda_stream))
        return grad_input, None


def overlap(box1, box2):
    """Return the pairwise intersection-over-union matrix of two box sets.

    Both arguments are 2-D tensors with one box per row and columns
    ``x1, y1, x2, y2``; entry ``[i, j]`` of the result relates ``box1[i]``
    to ``box2[j]``.
    """
    b1, b2 = box1.t().contiguous(), box2.t().contiguous()
    xx1 = torch.max(b1[0].unsqueeze(1), b2[0].unsqueeze(0))
    yy1 = torch.max(b1[1].unsqueeze(1), b2[1].unsqueeze(0))
    xx2 = torch.min(b1[2].unsqueeze(1), b2[2].unsqueeze(0))
    yy2 = torch.min(b1[3].unsqueeze(1), b2[3].unsqueeze(0))
    inter = area(x1 = xx1, y1 = yy1, x2 = xx2, y2 = yy2)
    return inter / (area(b1.t()).unsqueeze(1) + area(b2.t()).unsqueeze(0) - inter)


def area(boxes = None, x1 = None, y1 = None, x2 = None, y2 = None):
    """Return box areas using inclusive (``+ 1``) widths and heights.

    Either pass ``boxes`` (last dimension ``x1, y1, x2, y2``) or the four
    coordinate tensors by keyword. In the keyword form negative extents are
    clamped to zero, so non-intersecting pairs yield an area of 0.
    """
    return (boxes[..., 3] - boxes[..., 1] + 1) * (boxes[..., 2] - boxes[..., 0] + 1) if boxes is not None else (x2 - x1 + 1).clamp(min = 0) * (y2 - y1 + 1).clamp(min = 0)
Note: I've also had to replace `utils/bbox.pyx` with `utils/cython_bbox.pyx` and `utils/nms.pyx` with `utils/cython_nms.pyx` in `lib/setup.py` to deal with some `setup.py` issues.
还没有评论,说两句吧!
热门资源
Keras-ResNeXt
Keras ResNeXt Implementation of ResNeXt models...
seetafaceJNI
项目介绍 基于中科院seetaface2进行封装的JAVA...
spark-corenlp
This package wraps Stanford CoreNLP annotators ...
capsnet-with-caps...
CapsNet with capsule-wise convolution Project ...
inferno-boilerplate
This is a very basic boilerplate example for pe...
智能在线
400-630-6780
聆听.建议反馈
E-mail: support@tusaishared.com