CMU Deep Learning Systems: hw0

Since homework 0 is really just a warm-up, I'll simply paste the code here without much explanation.
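
For reference, these are the formulas the code below implements (a quick summary in my own notation, not something quoted from the handout). For softmax regression on a minibatch X of size B with one-hot label matrix I_y:

$$\ell_{\mathrm{softmax}}(Z, y) = \frac{1}{B}\sum_{i=1}^{B}\Big(\log\sum_{j}\exp Z_{ij} - Z_{i,\,y_i}\Big), \qquad \nabla_{\Theta}\,\ell = \frac{1}{B}\,X^{\top}\big(\mathrm{softmax}(X\Theta) - I_y\big)$$

and for the two-layer network with $Z_1 = \mathrm{ReLU}(X W_1)$:

$$G_2 = \mathrm{softmax}(Z_1 W_2) - I_y,\quad G_1 = \mathbf{1}\{Z_1 > 0\} \circ (G_2 W_2^{\top}),\quad \nabla_{W_1} = \tfrac{1}{B}\,X^{\top} G_1,\quad \nabla_{W_2} = \tfrac{1}{B}\,Z_1^{\top} G_2$$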

simple_ml.py

import struct
import numpy as np
import gzip
try:
from simple_ml_ext import *
except:
pass


def add(x, y):
""" A trivial 'add' function you should implement to get used to the
autograder and submission system. The solution to this problem is in the
the homework notebook.

Args:
x (Python number or numpy array)
y (Python number or numpy array)

Return:
Sum of x + y
"""
### BEGIN YOUR CODE
return x+y
### END YOUR CODE


def parse_mnist(image_filename, label_filename):
""" Read an images and labels file in MNIST format. See this page:
http://yann.lecun.com/exdb/mnist/ for a description of the file format.

Args:
image_filename (str): name of gzipped images file in MNIST format
label_filename (str): name of gzipped labels file in MNIST format

Returns:
Tuple (X,y):
X (numpy.ndarray[np.float32]): 2D numpy array containing the loaded
data. The dimensionality of the data should be
(num_examples x input_dim) where 'input_dim' is the full
dimension of the data, e.g., since MNIST images are 28x28, it
will be 784. Values should be of type np.float32, and the data
should be normalized to have a minimum value of 0.0 and a
maximum value of 1.0 (i.e., scale original values of 0 to 0.0
and 255 to 1.0).

y (numpy.ndarray[dtype=np.uint8]): 1D numpy array containing the
labels of the examples. Values should be of type np.uint8 and
for MNIST will contain the values 0-9.
"""
### BEGIN YOUR CODE
    # Parse the image file: bytes 4-7 hold the image count (big-endian),
    # and the raw pixel data starts at byte offset 16.
    with gzip.open(image_filename, 'rb') as f:
        f.seek(4)
        num_images = int.from_bytes(f.read(4), byteorder="big", signed=True)
        f.seek(16)
        image = f.read()
        X = np.frombuffer(image, dtype=np.uint8).astype(np.float32)
        X = X.reshape(num_images, 28 * 28) / 255
    # Parse the label file: labels are single bytes starting at byte offset 8.
    with gzip.open(label_filename, 'rb') as f1:
        f1.seek(4)
        num_labels = int.from_bytes(f1.read(4), byteorder="big", signed=True)
        f1.seek(8)
        labels = f1.read()
        y = np.frombuffer(labels, dtype=np.uint8)
    return X, y
### END YOUR CODE


def softmax_loss(Z, y):
""" Return softmax loss. Note that for the purposes of this assignment,
you don't need to worry about "nicely" scaling the numerical properties
of the log-sum-exp computation, but can just compute this directly.

Args:
Z (np.ndarray[np.float32]): 2D numpy array of shape
(batch_size, num_classes), containing the logit predictions for
each class.
y (np.ndarray[np.uint8]): 1D numpy array of shape (batch_size, )
containing the true label of each example.

Returns:
Average softmax loss over the sample.
"""
    ### BEGIN YOUR CODE
    # Per-example loss: log(sum_j exp(Z_ij)) - Z_{i, y_i}; return the batch mean.
    Z_exp = np.exp(Z)
    Z_exp_sum = np.sum(Z_exp, axis=1)
    Z_exp_sum_log = np.log(Z_exp_sum)
    rows = np.arange(Z.shape[0])
    Z_y = Z[rows, y]
    return np.mean(Z_exp_sum_log - Z_y)

### END YOUR CODE


def softmax_regression_epoch(X, y, theta, lr = 0.1, batch=100):
""" Run a single epoch of SGD for softmax regression on the data, using
the step size lr and specified batch size. This function should modify the
theta matrix in place, and you should iterate through batches in X _without_
randomizing the order.

Args:
X (np.ndarray[np.float32]): 2D input array of size
(num_examples x input_dim).
y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
theta (np.ndarrray[np.float32]): 2D array of softmax regression
parameters, of shape (input_dim, num_classes)
lr (float): step size (learning rate) for SGD
batch (int): size of SGD minibatch

Returns:
None
"""
### BEGIN YOUR CODE
    # One epoch of SGD: walk through X and y in order (no shuffling), taking
    # consecutive minibatches of size `batch` and applying one update per batch.
    for i in range(0, X.shape[0], batch):
        end = min(i + batch, X.shape[0])
        Z = X[i:end, :] @ theta
        Z_exp = np.exp(Z)
        Z_norm = Z_exp / (np.sum(Z_exp, axis=1)[:, np.newaxis])
        # One-hot encoding of the minibatch labels
        I_y = np.zeros((end - i, theta.shape[1]), dtype=int)
        for j in range(end - i):
            I_y[j, y[i + j]] = 1
        # Gradient of the average loss: X_b^T (softmax(X_b @ theta) - I_y) / batch_size
        delta = X[i:end, :].T @ (Z_norm - I_y) / (end - i)
        # Update theta in place
        theta[:] = theta - lr * delta
### END YOUR CODE


def nn_epoch(X, y, W1, W2, lr = 0.1, batch=100):
""" Run a single epoch of SGD for a two-layer neural network defined by the
weights W1 and W2 (with no bias terms):
logits = ReLU(X * W1) * W2
The function should use the step size lr, and the specified batch size (and
again, without randomizing the order of X). It should modify the
W1 and W2 matrices in place.

Args:
X (np.ndarray[np.float32]): 2D input array of size
(num_examples x input_dim).
y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,)
W1 (np.ndarray[np.float32]): 2D array of first layer weights, of shape
(input_dim, hidden_dim)
W2 (np.ndarray[np.float32]): 2D array of second layer weights, of shape
(hidden_dim, num_classes)
lr (float): step size (learning rate) for SGD
batch (int): size of SGD minibatch

Returns:
None
"""
### BEGIN YOUR CODE
    for i in range(0, X.shape[0], batch):
        end = min(i + batch, X.shape[0])
        X_batch = X[i:end, :]
        y_batch = y[i:end]
        # One-hot encoding of the minibatch labels
        I_y = np.zeros((len(y_batch), W2.shape[1]), dtype=int)
        for j in range(len(y_batch)):
            I_y[j, y_batch[j]] = 1
        # Forward pass: Z1 = ReLU(X_batch @ W1)
        Z_1 = np.maximum(X_batch @ W1, 0)
        # Backward pass: G2 = softmax(Z1 @ W2) - I_y, G1 = 1{Z1 > 0} * (G2 @ W2^T)
        G_2 = np.exp(Z_1 @ W2) / (np.sum(np.exp(Z_1 @ W2), axis=1)[:, np.newaxis]) - I_y
        G_1 = (Z_1 > 0).astype(int) * (G_2 @ W2.T)
        # Average gradients over the minibatch and update the weights in place
        delta_W_1 = X_batch.T @ G_1 / (end - i)
        delta_W_2 = Z_1.T @ G_2 / (end - i)
        W1 -= lr * delta_W_1
        W2 -= lr * delta_W_2
### END YOUR CODE



### CODE BELOW IS FOR ILLUSTRATION, YOU DO NOT NEED TO EDIT

def loss_err(h,y):
""" Helper funciton to compute both loss and error"""
return softmax_loss(h,y), np.mean(h.argmax(axis=1) != y)


def train_softmax(X_tr, y_tr, X_te, y_te, epochs=10, lr=0.5, batch=100,
cpp=False):
""" Example function to fully train a softmax regression classifier """
theta = np.zeros((X_tr.shape[1], y_tr.max()+1), dtype=np.float32)
print("| Epoch | Train Loss | Train Err | Test Loss | Test Err |")
for epoch in range(epochs):
if not cpp:
softmax_regression_epoch(X_tr, y_tr, theta, lr=lr, batch=batch)
else:
softmax_regression_epoch_cpp(X_tr, y_tr, theta, lr=lr, batch=batch)
train_loss, train_err = loss_err(X_tr @ theta, y_tr)
test_loss, test_err = loss_err(X_te @ theta, y_te)
print("| {:>4} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"\
.format(epoch, train_loss, train_err, test_loss, test_err))


def train_nn(X_tr, y_tr, X_te, y_te, hidden_dim = 500,
epochs=10, lr=0.5, batch=100):
""" Example function to train two layer neural network """
n, k = X_tr.shape[1], y_tr.max() + 1
np.random.seed(0)
W1 = np.random.randn(n, hidden_dim).astype(np.float32) / np.sqrt(hidden_dim)
W2 = np.random.randn(hidden_dim, k).astype(np.float32) / np.sqrt(k)

print("| Epoch | Train Loss | Train Err | Test Loss | Test Err |")
for epoch in range(epochs):
nn_epoch(X_tr, y_tr, W1, W2, lr=lr, batch=batch)
train_loss, train_err = loss_err(np.maximum(X_tr@W1,0)@W2, y_tr)
test_loss, test_err = loss_err(np.maximum(X_te@W1,0)@W2, y_te)
print("| {:>4} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"\
.format(epoch, train_loss, train_err, test_loss, test_err))



if __name__ == "__main__":
X_tr, y_tr = parse_mnist("data/train-images-idx3-ubyte.gz",
"data/train-labels-idx1-ubyte.gz")
X_te, y_te = parse_mnist("data/t10k-images-idx3-ubyte.gz",
"data/t10k-labels-idx1-ubyte.gz")

print("Training softmax regression")
train_softmax(X_tr, y_tr, X_te, y_te, epochs=10, lr = 0.1)

print("\nTraining two layer neural network w/ 100 hidden units")
train_nn(X_tr, y_tr, X_te, y_te, hidden_dim=100, epochs=20, lr = 0.2)

simple_ml_ext.cpp

This part is a bit of a trap. Since I had never studied C++ systematically, I naively assumed that I_y[end-i][k]={0} would zero-initialize the array, and spent a long time debugging before realizing that C++ has no standard support for variable-length arrays (VLAs). With some compilers' VLA extensions (as in this case), initializing with {0} may not guarantee that every element is zeroed, so you can end up with uninitialized garbage values. Unless you pull in <string.h> and memset the buffer yourself, it is safer to just use std::vector.
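
A minimal sketch of the two options, purely for illustration (not part of the homework code; the function name zero_init_demo is made up):

#include <cstring>
#include <vector>

void zero_init_demo(int rows, int cols) {
    // VLA route (a compiler extension, not standard C++): the contents are
    // indeterminate, so clear the buffer explicitly with memset.
    int I_y_vla[rows][cols];
    std::memset(I_y_vla, 0, sizeof(int) * rows * cols);

    // std::vector route (portable): every element is value-initialized to 0.
    std::vector<std::vector<int>> I_y_vec(rows, std::vector<int>(cols, 0));
    (void)I_y_vec;  // silence the unused-variable warning in this demo
}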

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <cmath>
#include <vector>
#include <iostream>

namespace py = pybind11;


void softmax_regression_epoch_cpp(const float *X, const unsigned char *y,
float *theta, size_t m, size_t n, size_t k,
float lr, size_t batch)
{
/**
* A C++ version of the softmax regression epoch code. This should run a
* single epoch over the data defined by X and y (and sizes m,n,k), and
* modify theta in place. Your function will probably want to allocate
* (and then delete) some helper arrays to store the logits and gradients.
*
* Args:
* X (const float *): pointer to X data, of size m*n, stored in row
* major (C) format
* y (const unsigned char *): pointer to y data, of size m
* theta (float *): pointer to theta data, of size n*k, stored in row
* major (C) format
* m (size_t): number of examples
* n (size_t): input dimension
* k (size_t): number of classes
* lr (float): learning rate / SGD step size
* batch (int): SGD minibatch size
*
* Returns:
* (None)
*/

/// BEGIN YOUR CODE
for(int i = 0;i <m;i+=batch){
int end = i+batch>m?m:i+batch;
std::vector<std::vector<float>> X_batch(end-i,std::vector<float>(n));
for(int j=0;j<end-i;j++){
for(int l=0;l<n;l++){
X_batch[j][l] = X[(i+j)*n+l];
}
}
std::vector<std::vector<float>> theta_vector(n,std::vector<float>(k));
for(int j=0;j<n;j++){
for(int l=0;l<k;l++){
theta_vector[j][l] = theta[j*k+l];
}
}
std::vector<std::vector<float>> Z(end-i, std::vector<float>(k, 0));
for (int j = 0; j < end-i; ++j) {
for (int l = 0; l < k; ++l) {
for (int p = 0; p < n; ++p) {
Z[j][l] += X_batch[j][p] * theta_vector[p][l];
}
}
}
std::vector<float> Z_sum(end - i, 0.0f);
for(int j=0;j<end-i;j++){
float sum =0;
for(int l=0;l<k;l++){
Z[j][l] = std::exp(Z[j][l]);
sum += Z[j][l];
}
Z_sum[j] = sum;
}
for(int j=0;j<end-i;j++){
for(int l=0;l<k;l++){
Z[j][l] /= Z_sum[j];
}
}
std::vector<std::vector<int>> I_y(end - i, std::vector<int>(k, 0));
for (int j = 0; j < end - i; j++){
I_y[j][y[i+j]] = 1;
}

std::vector<std::vector<float>> delta_batch_temp(end - i, std::vector<float>(k, 0));
for(int j=0;j<end-i;j++){
for(int l=0;l<k;l++){
delta_batch_temp[j][l] = Z[j][l] - I_y[j][l];
// std::cout <<"第"<<j<<"行"<<l<<"列,"<< delta_batch_temp[j][l]<<","<<Z[j][l]<<","<< I_y[j][l]<< " ";
}
}
std::vector<std::vector<float>> X_batch_T(n, std::vector<float>(end-i, 0));
for(int j=0;j<n;j++){
for(int l=0;l<end-i;l++){
X_batch_T[j][l]=X_batch[l][j];
}
}
// std::cout << "delta_batch:" << std::endl;
// for (int j = 0; j < n; j++) {
// for (int l = 0; l < end-i; l++) {
// std::cout << X_batch_T[j][l] << " ";
// }
// std::cout << std::endl;
// }
std::vector<std::vector<float>> delta_batch(n, std::vector<float>(k, 0));
for (int j = 0; j < n; j++) {
for (int p = 0; p < end-i; p++) {
for (int l = 0; l < k; l++) {
delta_batch[j][l] += X_batch_T[j][p] * delta_batch_temp[p][l];
}
}
}
// std::cout << "delta_batch:" << std::endl;
// for (int j = 0; j < end-i; j++) {
// for (int l = 0; l < k; l++) {
// std::cout << delta_batch_temp[j][l] << " ";
// }
// std::cout << std::endl;
// }
for(int j=0;j<n;j++){
for(int l=0;l<k;l++){
delta_batch[j][l] /=(end-i);
}
}

for(int j=0;j<n;j++){
for(int l=0;l<k;l++){
theta[j*k+l] -= delta_batch[j][l]*lr;
}
}
}
/// END YOUR CODE
}


/**
 * This is the pybind11 code that wraps the function above. Its only role is
 * to wrap the function above in a Python module, and you do not need to make
 * any edits to the code.
 */
PYBIND11_MODULE(simple_ml_ext, m) {
m.def("softmax_regression_epoch_cpp",
[](py::array_t<float, py::array::c_style> X,
py::array_t<unsigned char, py::array::c_style> y,
py::array_t<float, py::array::c_style> theta,
float lr,
int batch) {
softmax_regression_epoch_cpp(
static_cast<const float*>(X.request().ptr),
static_cast<const unsigned char*>(y.request().ptr),
static_cast<float*>(theta.request().ptr),
X.request().shape[0],
X.request().shape[1],
theta.request().shape[1],
lr,
batch
);
},
py::arg("X"), py::arg("y"), py::arg("theta"),
py::arg("lr"), py::arg("batch"));
}