CMU Deep Learning Systems: hw0

Since homework 0 is really just a warm-up, I'll simply paste the code here without much explanation.
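
For reference, these are the formulas the code below implements (a quick summary in my own notation, not something quoted from the handout). For softmax regression on a minibatch X of size B with one-hot label matrix I_y:

$$\ell_{\mathrm{softmax}}(Z, y) = \frac{1}{B}\sum_{i=1}^{B}\Big(\log\sum_{j}\exp Z_{ij} - Z_{i,\,y_i}\Big), \qquad \nabla_{\Theta}\,\ell = \frac{1}{B}\,X^{\top}\big(\mathrm{softmax}(X\Theta) - I_y\big)$$

and for the two-layer network with $Z_1 = \mathrm{ReLU}(X W_1)$:

$$G_2 = \mathrm{softmax}(Z_1 W_2) - I_y,\quad G_1 = \mathbf{1}\{Z_1 > 0\} \circ (G_2 W_2^{\top}),\quad \nabla_{W_1} = \tfrac{1}{B}\,X^{\top} G_1,\quad \nabla_{W_2} = \tfrac{1}{B}\,Z_1^{\top} G_2$$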

simple_ml.py

import struct
import numpy as np
import gzip
try:
from simple_ml_ext import *
except:
pass


def add(x, y):
""" A trivial 'add' function you should implement to get used to the
autograder and submission system. The solution to this problem is in the
the homework notebook.

Args:
x (Python number or numpy array)
y (Python number or numpy array)

Return:
Sum of x + y
"""
### BEGIN YOUR CODE
return x+y
### END YOUR CODE


def parse_mnist(image_filename, label_filename):
""" Read an images and labels file in MNIST format. See this page:
http://yann.lecun.com/exdb/mnist/ for a description of the file format.

Args:
image_filename (str): name of gzipped images file in MNIST format
label_filename (str): name of gzipped labels file in MNIST format

Returns:
Tuple (X,y):
X (numpy.ndarray[np.float32]): 2D numpy array containing the loaded
data. The dimensionality of the data should be
(num_examples x input_dim) where 'input_dim' is the full
dimension of the data, e.g., since MNIST images are 28x28, it
will be 784. Values should be of type np.float32, and the data
should be normalized to have a minimum value of 0.0 and a
maximum value of 1.0 (i.e., scale original values of 0 to 0.0
and 255 to 1.0).

y (numpy.ndarray[dtype=np.uint8]): 1D numpy array containing the
labels of the examples. Values should be of type np.uint8 and
for MNIST will contain the values 0-9.
"""
### BEGIN YOUR CODE
    # Parse the image file: bytes 4-7 hold the image count (big-endian),
    # and the raw pixel data starts at byte offset 16.
    with gzip.open(image_filename, 'rb') as f:
        f.seek(4)
        num_images = int.from_bytes(f.read(4), byteorder="big", signed=True)
        f.seek(16)
        image = f.read()
        X = np.frombuffer(image, dtype=np.uint8).astype(np.float32)
        X = X.reshape(num_images, 28 * 28) / 255
    # Parse the label file: labels are single bytes starting at byte offset 8.
    with gzip.open(label_filename, 'rb') as f1:
        f1.seek(4)
        num_labels = int.from_bytes(f1.read(4), byteorder="big", signed=True)
        f1.seek(8)
        labels = f1.read()
        y = np.frombuffer(labels, dtype=np.uint8)
    return X, y
### END YOUR CODE


def softmax_loss(Z, y):
""" Return softmax loss. Note that for the purposes of this assignment,
you don't need to worry about "nicely" scaling the numerical properties
of the log-sum-exp computation, but can just compute this directly.

Args:
Z (np.ndarray[np.float32]): 2D numpy array of shape
(batch_size, num_classes), containing the logit predictions for
each class.
y (np.ndarray[np.uint8]): 1D numpy array of shape (batch_size, )
containing the true label of each example.

Returns:
Average softmax loss over the sample.
"""
    ### BEGIN YOUR CODE
    # Per-example loss: log(sum_j exp(Z_ij)) - Z_{i, y_i}; return the batch mean.
    Z_exp = np.exp(Z)
    Z_exp_sum = np.sum(Z_exp, axis=1)
    Z_exp_sum_log = np.log(Z_exp_sum)
    rows = np.arange(Z.shape[0])
    Z_y = Z[rows, y]
    return np.mean(Z_exp_sum_log - Z_y)

### END YOUR CODE


def softmax_regression_epoch(X, y, theta, lr = 0.1, batch=100):
""" Run a single epoch of SGD for softmax regression on the data, using
the step size lr and specified batch size. This function should modify the
theta matrix in place, and you should iterate through batches in X _without_
randomizing the order.

Args:
X (np.ndarray[np.float32]): 2D input array of size
(num_examples x input_dim).
y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
theta (np.ndarrray[np.float32]): 2D array of softmax regression
parameters, of shape (input_dim, num_classes)
lr (float): step size (learning rate) for SGD
batch (int): size of SGD minibatch

Returns:
None
"""
### BEGIN YOUR CODE
    # One epoch of SGD: walk through X and y in order (no shuffling), taking
    # consecutive minibatches of size `batch` and applying one update per batch.
    for i in range(0, X.shape[0], batch):
        end = min(i + batch, X.shape[0])
        Z = X[i:end, :] @ theta
        Z_exp = np.exp(Z)
        Z_norm = Z_exp / (np.sum(Z_exp, axis=1)[:, np.newaxis])
        # One-hot encoding of the minibatch labels
        I_y = np.zeros((end - i, theta.shape[1]), dtype=int)
        for j in range(end - i):
            I_y[j, y[i + j]] = 1
        # Gradient of the average loss: X_b^T (softmax(X_b @ theta) - I_y) / batch_size
        delta = X[i:end, :].T @ (Z_norm - I_y) / (end - i)
        # Update theta in place
        theta[:] = theta - lr * delta
### END YOUR CODE


def nn_epoch(X, y, W1, W2, lr = 0.1, batch=100):
""" Run a single epoch of SGD for a two-layer neural network defined by the
weights W1 and W2 (with no bias terms):
logits = ReLU(X * W1) * W2
The function should use the step size lr, and the specified batch size (and
again, without randomizing the order of X). It should modify the
W1 and W2 matrices in place.

Args:
X (np.ndarray[np.float32]): 2D input array of size
(num_examples x input_dim).
y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
W1 (np.ndarray[np.float32]): 2D array of first layer weights, of shape
(input_dim, hidden_dim)
W2 (np.ndarray[np.float32]): 2D array of second layer weights, of shape
(hidden_dim, num_classes)
lr (float): step size (learning rate) for SGD
batch (int): size of SGD minibatch

Returns:
None
"""
### BEGIN YOUR CODE
    for i in range(0, X.shape[0], batch):
        end = min(i + batch, X.shape[0])
        X_batch = X[i:end, :]
        y_batch = y[i:end]
        # One-hot encoding of the minibatch labels
        I_y = np.zeros((len(y_batch), W2.shape[1]), dtype=int)
        for j in range(len(y_batch)):
            I_y[j, y_batch[j]] = 1
        # Forward pass: Z1 = ReLU(X_batch @ W1)
        Z_1 = np.maximum(X_batch @ W1, 0)
        # Backward pass: G2 = softmax(Z1 @ W2) - I_y, G1 = 1{Z1 > 0} * (G2 @ W2^T)
        G_2 = np.exp(Z_1 @ W2) / (np.sum(np.exp(Z_1 @ W2), axis=1)[:, np.newaxis]) - I_y
        G_1 = (Z_1 > 0).astype(int) * (G_2 @ W2.T)
        # Average gradients over the minibatch and update the weights in place
        delta_W_1 = X_batch.T @ G_1 / (end - i)
        delta_W_2 = Z_1.T @ G_2 / (end - i)
        W1 -= lr * delta_W_1
        W2 -= lr * delta_W_2
### END YOUR CODE



### CODE BELOW IS FOR ILLUSTRATION, YOU DO NOT NEED TO EDIT

def loss_err(h,y):
""" Helper funciton to compute both loss and error"""
return softmax_loss(h,y), np.mean(h.argmax(axis=1) != y)


def train_softmax(X_tr, y_tr, X_te, y_te, epochs=10, lr=0.5, batch=100,
cpp=False):
""" Example function to fully train a softmax regression classifier """
theta = np.zeros((X_tr.shape[1], y_tr.max()+1), dtype=np.float32)
print("| Epoch | Train Loss | Train Err | Test Loss | Test Err |")
for epoch in range(epochs):
if not cpp:
softmax_regression_epoch(X_tr, y_tr, theta, lr=lr, batch=batch)
else:
softmax_regression_epoch_cpp(X_tr, y_tr, theta, lr=lr, batch=batch)
train_loss, train_err = loss_err(X_tr @ theta, y_tr)
test_loss, test_err = loss_err(X_te @ theta, y_te)
print("| {:>4} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"\
.format(epoch, train_loss, train_err, test_loss, test_err))


def train_nn(X_tr, y_tr, X_te, y_te, hidden_dim = 500,
epochs=10, lr=0.5, batch=100):
""" Example function to train two layer neural network """
n, k = X_tr.shape[1], y_tr.max() + 1
np.random.seed(0)
W1 = np.random.randn(n, hidden_dim).astype(np.float32) / np.sqrt(hidden_dim)
W2 = np.random.randn(hidden_dim, k).astype(np.float32) / np.sqrt(k)

print("| Epoch | Train Loss | Train Err | Test Loss | Test Err |")
for epoch in range(epochs):
nn_epoch(X_tr, y_tr, W1, W2, lr=lr, batch=batch)
train_loss, train_err = loss_err(np.maximum(X_tr@W1,0)@W2, y_tr)
test_loss, test_err = loss_err(np.maximum(X_te@W1,0)@W2, y_te)
print("| {:>4} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"\
.format(epoch, train_loss, train_err, test_loss, test_err))



if __name__ == "__main__":
X_tr, y_tr = parse_mnist("data/train-images-idx3-ubyte.gz",
"data/train-labels-idx1-ubyte.gz")
X_te, y_te = parse_mnist("data/t10k-images-idx3-ubyte.gz",
"data/t10k-labels-idx1-ubyte.gz")

print("Training softmax regression")
train_softmax(X_tr, y_tr, X_te, y_te, epochs=10, lr = 0.1)

print("\nTraining two layer neural network w/ 100 hidden units")
train_nn(X_tr, y_tr, X_te, y_te, hidden_dim=100, epochs=20, lr = 0.2)

simple_ml_ext.cpp

This part is a bit of a trap. Since I had never studied C++ systematically, I naively assumed that I_y[end-i][k]={0} would zero-initialize the array, and spent a long time debugging before realizing that C++ has no standard support for variable-length arrays (VLAs). With some compilers' VLA extensions (as in this case), initializing with {0} may not guarantee that every element is zeroed, so you can end up with uninitialized garbage values. Unless you pull in <string.h> and memset the buffer yourself, it is safer to just use std::vector.
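
A minimal sketch of the two options, purely for illustration (not part of the homework code; the function name zero_init_demo is made up):

#include <cstring>
#include <vector>

void zero_init_demo(int rows, int cols) {
    // VLA route (a compiler extension, not standard C++): the contents are
    // indeterminate, so clear the buffer explicitly with memset.
    int I_y_vla[rows][cols];
    std::memset(I_y_vla, 0, sizeof(int) * rows * cols);

    // std::vector route (portable): every element is value-initialized to 0.
    std::vector<std::vector<int>> I_y_vec(rows, std::vector<int>(cols, 0));
    (void)I_y_vec;  // silence the unused-variable warning in this demo
}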

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <cmath>
#include <vector>
#include <iostream>

namespace py = pybind11;


void softmax_regression_epoch_cpp(const float *X, const unsigned char *y,
float *theta, size_t m, size_t n, size_t k,
float lr, size_t batch)
{
/**
* A C++ version of the softmax regression epoch code. This should run a
* single epoch over the data defined by X and y (and sizes m,n,k), and
* modify theta in place. Your function will probably want to allocate
* (and then delete) some helper arrays to store the logits and gradients.
*
* Args:
* X (const float *): pointer to X data, of size m*n, stored in row
* major (C) format
* y (const unsigned char *): pointer to y data, of size m
* theta (float *): pointer to theta data, of size n*k, stored in row
* major (C) format
* m (size_t): number of examples
* n (size_t): input dimension
* k (size_t): number of classes
* lr (float): learning rate / SGD step size
* batch (int): SGD minibatch size
*
* Returns:
* (None)
*/

/// BEGIN YOUR CODE
for(int i = 0;i <m;i+=batch){
int end = i+batch>m?m:i+batch;
std::vector<std::vector<float>> X_batch(end-i,std::vector<float>(n));
for(int j=0;j<end-i;j++){
for(int l=0;l<n;l++){
X_batch[j][l] = X[(i+j)*n+l];
}
}
std::vector<std::vector<float>> theta_vector(n,std::vector<float>(k));
for(int j=0;j<n;j++){
for(int l=0;l<k;l++){
theta_vector[j][l] = theta[j*k+l];
}
}
std::vector<std::vector<float>> Z(end-i, std::vector<float>(k, 0));
for (int j = 0; j < end-i; ++j) {
for (int l = 0; l < k; ++l) {
for (int p = 0; p < n; ++p) {
Z[j][l] += X_batch[j][p] * theta_vector[p][l];
}
}
}
std::vector<float> Z_sum(end - i, 0.0f);
for(int j=0;j<end-i;j++){
float sum =0;
for(int l=0;l<k;l++){
Z[j][l] = std::exp(Z[j][l]);
sum += Z[j][l];
}
Z_sum[j] = sum;
}
for(int j=0;j<end-i;j++){
for(int l=0;l<k;l++){
Z[j][l] /= Z_sum[j];
}
}
std::vector<std::vector<int>> I_y(end - i, std::vector<int>(k, 0));
for (int j = 0; j < end - i; j++){
I_y[j][y[i+j]] = 1;
}

std::vector<std::vector<float>> delta_batch_temp(end - i, std::vector<float>(k, 0));
for(int j=0;j<end-i;j++){
for(int l=0;l<k;l++){
delta_batch_temp[j][l] = Z[j][l] - I_y[j][l];
// std::cout <<"第"<<j<<"行"<<l<<"列,"<< delta_batch_temp[j][l]<<","<<Z[j][l]<<","<< I_y[j][l]<< " ";
}
}
std::vector<std::vector<float>> X_batch_T(n, std::vector<float>(end-i, 0));
for(int j=0;j<n;j++){
for(int l=0;l<end-i;l++){
X_batch_T[j][l]=X_batch[l][j];
}
}
// std::cout << "delta_batch:" << std::endl;
// for (int j = 0; j < n; j++) {
// for (int l = 0; l < end-i; l++) {
// std::cout << X_batch_T[j][l] << " ";
// }
// std::cout << std::endl;
// }
std::vector<std::vector<float>> delta_batch(n, std::vector<float>(k, 0));
for (int j = 0; j < n; j++) {
for (int p = 0; p < end-i; p++) {
for (int l = 0; l < k; l++) {
delta_batch[j][l] += X_batch_T[j][p] * delta_batch_temp[p][l];
}
}
}
// std::cout << "delta_batch:" << std::endl;
// for (int j = 0; j < end-i; j++) {
// for (int l = 0; l < k; l++) {
// std::cout << delta_batch_temp[j][l] << " ";
// }
// std::cout << std::endl;
// }
for(int j=0;j<n;j++){
for(int l=0;l<k;l++){
delta_batch[j][l] /=(end-i);
}
}

for(int j=0;j<n;j++){
for(int l=0;l<k;l++){
theta[j*k+l] -= delta_batch[j][l]*lr;
}
}
}
/// END YOUR CODE
}


/**
 * This is the pybind11 code that wraps the function above. Its only role is
 * to wrap the function above in a Python module, and you do not need to make
 * any edits to the code.
 */
PYBIND11_MODULE(simple_ml_ext, m) {
m.def("softmax_regression_epoch_cpp",
[](py::array_t<float, py::array::c_style> X,
py::array_t<unsigned char, py::array::c_style> y,
py::array_t<float, py::array::c_style> theta,
float lr,
int batch) {
softmax_regression_epoch_cpp(
static_cast<const float*>(X.request().ptr),
static_cast<const unsigned char*>(y.request().ptr),
static_cast<float*>(theta.request().ptr),
X.request().shape[0],
X.request().shape[1],
theta.request().shape[1],
lr,
batch
);
},
py::arg("X"), py::arg("y"), py::arg("theta"),
py::arg("lr"), py::arg("batch"));
}