def compute(self, a):
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    if self.axes is None:
        return array_api.swapaxes(a, len(a.shape) - 1, len(a.shape) - 2)
    else:
        return array_api.swapaxes(a, self.axes[0], self.axes[1])
    ### END YOUR SOLUTION
Log
def compute(self, a):
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    return array_api.log(a)
    ### END YOUR SOLUTION
Exp
def compute(self, a):
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    return array_api.exp(a)
    ### END YOUR SOLUTION
EWisePow
def compute(self, a: NDArray, b: NDArray) -> NDArray:
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    return array_api.power(a, b)
    ### END YOUR SOLUTION
At the same time, because this function is meant to test whether the computed gradient matches the numerical gradient, it treats out_grad as all ones: for x in out.op.gradient_as_tuple(ndl.Tensor(np.ones(out.shape)), out). If the op's output were itself the final result, out_grad would indeed be all ones, consistent with the above. That all-ones upstream gradient is then passed through your own backward implementation, and the result is asserted against the numerically computed gradient.
Several of the backward-pass gradients are fairly abstract; they can all be analyzed in this way. Details are given with each function below.
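To make the idea concrete, here is a minimal, self-contained sketch of such a central-difference gradient check in plain NumPy. The helper numerical_grad, the test function x**3, and the tolerances are illustrative assumptions, not part of the homework's actual test harness; the point is only that the upstream gradient is treated as all ones (i.e. we differentiate sum(f(x))) and the analytic gradient is compared against finite differences.

import numpy as np

def numerical_grad(f, x, eps=1e-4):
    """Central-difference estimate of d(sum(f(x)))/dx, i.e. out_grad is all ones."""
    grad = np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        x_plus, x_minus = x.copy(), x.copy()
        x_plus[idx] += eps
        x_minus[idx] -= eps
        grad[idx] = (f(x_plus).sum() - f(x_minus).sum()) / (2 * eps)
    return grad

x = np.random.randn(3, 4)
analytic = 3 * x ** 2                          # gradient of f(x) = x**3 with out_grad = 1
numeric = numerical_grad(lambda a: a ** 3, x)
assert np.allclose(analytic, numeric, atol=1e-3)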
PowerScalar
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return out_grad * self.scalar * node.inputs[0] ** (self.scalar - 1)
    ### END YOUR SOLUTION
EWiseDiv
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return out_grad / node.inputs[1], out_grad * node.inputs[0] * (-node.inputs[1] ** (-2))
    ### END YOUR SOLUTION
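The two returned terms are just the quotient rule applied elementwise. Writing the op as $f(a, b) = a / b$:

$$
\frac{\partial f}{\partial a} = \frac{1}{b}, \qquad
\frac{\partial f}{\partial b} = -\frac{a}{b^{2}},
$$

so the adjoints are $\overline{a} = \overline{\text{out}} / b$ and $\overline{b} = -\overline{\text{out}} \cdot a / b^{2}$, matching the code above.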
DivScalar
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return out_grad / self.scalar
    ### END YOUR SOLUTION
BroadcastTo
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    original_shape = node.inputs[0].shape
    broadcasted_shape = self.shape
    # Find all axes that need to be summed over (the axes expanded by the broadcast).
    # Broadcasting aligns shapes from the right, so compare with a right-aligned offset.
    axes = []
    offset = len(broadcasted_shape) - len(original_shape)
    for i in range(len(broadcasted_shape)):
        if i < offset or original_shape[i - offset] != broadcasted_shape[i]:
            axes.append(i)
    # Sum along those axes, then restore the original shape.
    grad = summation(out_grad, axes=tuple(axes))
    return grad.reshape(original_shape)
    ### END YOUR SOLUTION
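As a sanity check in plain NumPy (illustrative only, not the needle API): broadcasting copies each original entry along the expanded axes, so the gradient of that entry is the sum of the upstream gradients over all of its copies. That is exactly what the summation over the expanded axes computes.

import numpy as np

a = np.random.randn(3, 1)            # original shape (3, 1)
out_grad = np.random.randn(3, 3)     # upstream gradient for the broadcast shape (3, 3)

# Each a[i, 0] was copied into out[i, 0..2], so its gradient is the row sum.
grad_a = out_grad.sum(axis=1, keepdims=True)
assert grad_a.shape == a.shape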
Reshape
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return reshape(out_grad, node.inputs[0].shape)
    ### END YOUR SOLUTION
Negate
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return -1 * out_grad
    ### END YOUR SOLUTION
Transpose
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return transpose(out_grad, self.axes)
    ### END YOUR SOLUTION
Log
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return out_grad * power_scalar(node.inputs[0], -1)
    ### END YOUR SOLUTION
Exp
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return out_grad * exp(node.inputs[0])
    ### END YOUR SOLUTION
EWisePow
def gradient(self, out_grad, node):
    ### BEGIN YOUR SOLUTION
    return out_grad * node.inputs[1] * power(node.inputs[0], node.inputs[1] - 1), \
           out_grad * log(node.inputs[0]) * power(node.inputs[0], node.inputs[1])
    ### END YOUR SOLUTION
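The two terms come from differentiating $f(a, b) = a^{b}$ with respect to each input:

$$
\frac{\partial a^{b}}{\partial a} = b\, a^{b-1}, \qquad
\frac{\partial a^{b}}{\partial b} = a^{b} \ln a,
$$

so the adjoints are $\overline{a} = \overline{\text{out}} \cdot b\, a^{b-1}$ and $\overline{b} = \overline{\text{out}} \cdot a^{b} \ln a$. Note that the second term is only well defined for $a > 0$.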
def find_topo_sort(node_list: List[Value]) -> List[Value]:
    """Given a list of nodes, return a topological sort list of nodes ending in them.

    A simple algorithm is to do a post-order DFS traversal on the given nodes,
    going backwards based on input edges. Since a node is added to the ordering
    after all its predecessors are traversed due to post-order DFS, we get a
    topological sort.
    """
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    # The input here is the sink node of the graph.
    topo_order = []
    topo_sort_dfs(node_list[0], False, topo_order)
    return topo_order
    ### END YOUR SOLUTION
def topo_sort_dfs(node, visited, topo_order):
    """Post-order DFS"""
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    if visited or node is None:
        # This check could be omitted, but then a guard would be needed on
        # inputs[0] below; keeping it here is simpler.
        return
    if node.is_leaf():
        topo_order.append(node)
        return
    # Very important: pass whether the child is already in topo_order as its visited flag.
    topo_sort_dfs(node.inputs[0], node.inputs[0] in topo_order, topo_order)
    if len(node.inputs) > 1:
        topo_sort_dfs(node.inputs[1], node.inputs[1] in topo_order, topo_order)
    topo_order.append(node)
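To see why post-order DFS yields a topological order, here is a tiny standalone illustration on a plain adjacency-list graph (this is not the needle API; the dict of input edges and the node names are made up for the example). Each node is appended only after all of its predecessors, so predecessors always come first in the result.

def topo_sort(sink, inputs, order=None):
    """Post-order DFS from `sink`, following `inputs[node]` edges backwards."""
    if order is None:
        order = []
    if sink in order:
        return order
    for parent in inputs.get(sink, []):
        topo_sort(parent, inputs, order)
    order.append(sink)        # appended only after all predecessors
    return order

# d = f(b, c), b = g(a), c = h(a)  ->  a must come first, d last
inputs = {"d": ["b", "c"], "b": ["a"], "c": ["a"]}
print(topo_sort("d", inputs))     # ['a', 'b', 'c', 'd']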
def compute_gradient_of_variables(output_tensor, out_grad):
    """Take gradient of output node with respect to each node in node_list.

    Store the computed result in the grad field of each Variable.
    """
    # a map from node to a list of gradient contributions from each output node
    node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {}
    # Special note on initializing gradient of
    # We are really taking a derivative of the scalar reduce_sum(output_node)
    # instead of the vector output_node. But this is the common case for loss function.
    node_to_output_grads_list[output_tensor] = [out_grad]
    # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
    reverse_topo_order = list(reversed(find_topo_sort([output_tensor])))
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    for node in reverse_topo_order:
        # Sum all gradient contributions that flowed into this node.
        node.grad = sum_node_list(node_to_output_grads_list[node])
        if node.is_leaf():
            continue
        grads = node.op.gradient(node.grad, node)
        if isinstance(grads, tuple):
            for index, grad in enumerate(grads):
                if node.inputs[index] in node_to_output_grads_list:
                    node_to_output_grads_list[node.inputs[index]].append(grad)
                else:
                    node_to_output_grads_list[node.inputs[index]] = [grad]
        else:
            if node.inputs[0] in node_to_output_grads_list:
                node_to_output_grads_list[node.inputs[0]].append(grads)
            else:
                node_to_output_grads_list[node.inputs[0]] = [grads]
    ### END YOUR SOLUTION
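The reason each node keeps a list of partial adjoints that is summed before use is the multivariate chain rule: if a node $v_i$ feeds into several downstream nodes, its adjoint is the sum of the contributions from every consumer,

$$
\overline{v_i} \;=\; \frac{\partial \ell}{\partial v_i}
\;=\; \sum_{j \,:\, v_i \in \mathrm{inputs}(v_j)} \overline{v_j}\,\frac{\partial v_j}{\partial v_i}.
$$

Traversing in reverse topological order guarantees that all consumers of $v_i$ have already deposited their contributions before $\overline{v_i}$ is formed.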
def softmax_loss(Z, y_one_hot):
    """Return softmax loss.

    Note that for the purposes of this assignment, you don't need to worry about
    "nicely" scaling the numerical properties of the log-sum-exp computation, but
    can just compute this directly.

    Args:
        Z (ndl.Tensor[np.float32]): 2D Tensor of shape (batch_size, num_classes),
            containing the logit predictions for each class.
        y_one_hot (ndl.Tensor[np.int8]): 2D Tensor of shape (batch_size, num_classes)
            containing a 1 at the index of the true label of each example and
            zeros elsewhere.

    Returns:
        Average softmax loss over the sample. (ndl.Tensor[np.float32])
    """
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    Z_exp = ndl.exp(Z)
    Z_exp_sum = ndl.summation(Z_exp, axes=1)
    Z_exp_sum_log = ndl.log(Z_exp_sum)
    Z_y = ndl.summation(Z * y_one_hot, axes=1)
    loss = ndl.summation(Z_exp_sum_log - Z_y) / Z.shape[0]
    return loss
    ### END YOUR SOLUTION
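In math, the code computes the average cross-entropy loss

$$
\ell(Z, y) \;=\; \frac{1}{m}\sum_{i=1}^{m}\Big(\log\sum_{j=1}^{k}\exp z_{ij} \;-\; z_{i,\,y_i}\Big),
$$

where $m$ is the batch size and $k$ the number of classes; the term $z_{i,y_i}$ is selected by the elementwise product with the one-hot labels followed by a row sum, exactly as Z_y does above.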
def nn_epoch(X, y, W1, W2, lr=0.1, batch=100):
    """Run a single epoch of SGD for a two-layer neural network defined by the
    weights W1 and W2 (with no bias terms):
        logits = ReLU(X * W1) * W2
    The function should use the step size lr, and the specified batch size (and
    again, without randomizing the order of X).

    Args:
        X (np.ndarray[np.float32]): 2D input array of size (num_examples x input_dim).
        y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
        W1 (ndl.Tensor[np.float32]): 2D array of first layer weights, of shape
            (input_dim, hidden_dim)
        W2 (ndl.Tensor[np.float32]): 2D array of second layer weights, of shape
            (hidden_dim, num_classes)
        lr (float): step size (learning rate) for SGD
        batch (int): size of SGD mini-batch

    Returns:
        Tuple: (W1, W2)
            W1: ndl.Tensor[np.float32]
            W2: ndl.Tensor[np.float32]
    """
    ### BEGIN YOUR SOLUTION
    # raise NotImplementedError()
    num_classes = W2.shape[1]
    for i in range(0, X.shape[0], batch):
        end = min(i + batch, X.shape[0])
        X_batch = X[i:end, :]
        y_batch = y[i:end]
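For each mini-batch, the remaining steps (not shown in the excerpt above) are the usual forward/backward/update cycle: build the graph $Z_1 = \mathrm{ReLU}(X_b W_1)$, compute the softmax loss from the previous section on $Z_1 W_2$, obtain the gradients from the autograd machinery implemented above, and apply the SGD update

$$
W_1 \leftarrow W_1 - \mathrm{lr}\cdot\frac{\partial \ell}{\partial W_1},\qquad
W_2 \leftarrow W_2 - \mathrm{lr}\cdot\frac{\partial \ell}{\partial W_2}.
$$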