\documentclass[10pt,letter,notitlepage]{article}
%Mise en page
\usepackage[left=2cm, right=2cm, lines=45, top=0.8in, bottom=0.7in]{geometry}
\usepackage{fancyhdr}
\usepackage{fancybox}
\usepackage{pdfpages}
\renewcommand{\headrulewidth}{1.5pt}
\renewcommand{\footrulewidth}{1.5pt}
\pagestyle{fancy}
\newcommand\Loadedframemethod{TikZ}
\usepackage[framemethod=\Loadedframemethod]{mdframed}
\usepackage{tikz}
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
%\usepackage{url}
\usepackage{dsfont}
\usepackage{amssymb,amsmath}
\usepackage{xspace}
\usepackage{enumitem}
\lhead{
\textbf{University of Waterloo}
}
\rhead{\textbf{2023 Fall}
}
\chead{\textbf{
CS480/680
}}
\usepackage{stmaryrd}
\newcommand{\RR}{\mathds{R}}
\newcommand{\sign}{\mathop{\mathrm{sign}}}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\zero}{\mathbf{0}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\bv}{\mathbf{b}}
\newcommand{\wv}{\mathbf{w}}
\newcommand{\xv}{\mathbf{x}}
\newcommand{\Xv}{\mathbf{X}}
\newcommand{\Yv}{\mathbf{Y}}
\newcommand{\zv}{\mathbf{z}}
\newcommand{\yv}{\mathbf{y}}
\newcommand{\rv}{\mathbf{r}}
\newcommand{\inner}[2]{\langle #1, #2 \rangle}
\newcommand{\red}[1]{{\color{red}#1}}
\newcommand{\blue}[1]{{\color{blue}#1}}
\newcommand{\magenta}[1]{{\color{magenta}#1}}
\usepackage[normalem]{ulem}% normalem keeps \emph as italics instead of underlining
\newcommand{\ea}{{et al.}\xspace}
\newcommand{\eg}{{e.g.}\xspace}
\newcommand{\ie}{{i.e.}\xspace}
\newcommand{\iid}{{i.i.d.}\xspace}
\newcommand{\cf}{{cf.}\xspace}
\newcommand{\wrt}{{w.r.t.}\xspace}
\newcommand{\aka}{{a.k.a.}\xspace}
\newcommand{\etc}{{etc.}\xspace}
\newcommand{\sgm}{\mathsf{sgm}}
\newcommand{\Dc}{\mathcal{D}}
\newcommand{\ans}[1]{{\color{orange}\textsf{Ans}: #1}}
\newcommand{\pred}[1]{\llbracket#1\rrbracket}
\newcommand{\EE}{\mathds{E}}
\newcommand{\pv}{\mathbf{p}}
\lfoot{}
\cfoot{\textbf{Gautam Kamath (gckamath@uwaterloo.ca) \textcopyright~2023}}
%================================
%================================
\setlength{\parskip}{1cm}
\setlength{\parindent}{1cm}
\tikzset{titregris/.style =
{draw=gray,fill=white, shading = exersicetitle, %
text=gray, rectangle, rounded corners, right,minimum height=.3cm}}
\pgfdeclarehorizontalshading{exersicebackground}{100bp}
{color(0bp)=(green!40); color(100bp)=(black!5)}
\pgfdeclarehorizontalshading{exersicetitle}{100bp}
{color(0bp)=(red!40);color(100bp)=(black!5)}
\newcounter{exercise}
\renewcommand*\theexercise{\textbf{Exercise}~\arabic{exercise}}
\makeatletter
\def\mdf@@exercisepoints{}%new mdframed key:
\define@key{mdf}{exercisepoints}{%
\def\mdf@@exercisepoints{#1}
}
\mdfdefinestyle{exercisestyle}{%
outerlinewidth=1em,outerlinecolor=white,%
leftmargin=-1em,rightmargin=-1em,%
middlelinewidth=0.5pt,roundcorner=3pt,linecolor=black,
apptotikzsetting={\tikzset{mdfbackground/.append style ={%
shading = exersicebackground}}},
innertopmargin=0.1\baselineskip,
skipabove={\dimexpr0.1\baselineskip+0\topskip\relax},
skipbelow={-0.1em},
needspace=0.5\baselineskip,
frametitlefont=\sffamily\bfseries,
settings={\global\stepcounter{exercise}},
singleextra={%
\node[titregris,xshift=0.5cm] at (P-|O) %
{~\mdf@frametitlefont{\theexercise}~};
\ifdefempty{\mdf@@exercisepoints}%
{}%
{\node[titregris,left,xshift=-1cm] at (P)%
{~\mdf@frametitlefont{\mdf@@exercisepoints points}~};}%
},
firstextra={%
\node[titregris,xshift=1cm] at (P-|O) %
{~\mdf@frametitlefont{\theexercise}~};
\ifdefempty{\mdf@@exercisepoints}%
{}%
{\node[titregris,left,xshift=-1cm] at (P)%
{~\mdf@frametitlefont{\mdf@@exercisepoints points}~};}%
},
}
\makeatother
%%%%%%%%%
%%%%%%%%%%%%%%%
\mdfdefinestyle{theoremstyle}{%
outerlinewidth=0.01em,linecolor=black,middlelinewidth=0.5pt,%
frametitlerule=true,roundcorner=2pt,%
apptotikzsetting={\tikzset{mfframetitlebackground/.append style={%
shade,left color=white, right color=blue!20}}},
frametitlerulecolor=black,%green!60,
innerbottommargin=0.5\baselineskip,
frametitlerulewidth=0.1pt,
innertopmargin=0.7\topskip,skipabove={\dimexpr0.2\baselineskip+0.1\topskip\relax},
frametitleaboveskip=1pt,
frametitlebelowskip=1pt
}
\setlength{\parskip}{0mm}
\setlength{\parindent}{10mm}
\mdtheorem[style=theoremstyle]{exercise}{\textbf{Exercise}}
%================Liste definition--numList-and alphList=============
\newcounter{alphListCounter}
\newenvironment
{alphList}
{\begin{list}
{\alph{alphListCounter})}
{\usecounter{alphListCounter}
\setlength{\rightmargin}{0cm}
\setlength{\leftmargin}{0.5cm}
\setlength{\itemsep}{0.2cm}
\setlength{\partopsep}{0cm}
\setlength{\parsep}{0cm}}
}
{\end{list}}
\newcounter{numListCounter}
\newenvironment
{numList}
{\begin{list}
{\arabic{numListCounter})}
{\usecounter{numListCounter}
\setlength{\rightmargin}{0cm}
\setlength{\leftmargin}{0.5cm}
\setlength{\itemsep}{0cm}
\setlength{\partopsep}{0cm}
\setlength{\parsep}{0cm}}
}
{\end{list}}
\usepackage[breaklinks=true,letterpaper=true,linkcolor=magenta,urlcolor=magenta,citecolor=black]{hyperref}
\usepackage{cleveref}
\usepackage{xpatch}
\xpretocmd{\algorithm}{\hsize=\linewidth}{}{}
%===========================================================
\begin{document}
\begin{center}
{\large \textbf{CS480/680: Introduction to Machine Learning} \\ Homework 2\\ Due: 11:59 pm, October 25, 2023, submit on LEARN and CrowdMark.} \\
Include your name and student number!
\end{center}
\begin{center}
Submit your writeup in pdf and all source code in a zip file (with proper documentation).
Fill in the provided stub files, keeping the directory structure. You do not have to submit the provided datasets.
Make sure your code runs!
[Text in square brackets provides hints that can be ignored.]
\end{center}
\begin{exercise}[Fun with Classification (5 pts)]
For this problem, you are allowed to use \texttt{statsmodels} and \texttt{sklearn} as directed.
\begin{enumerate}
\item (2 pts) Run logistic regression, SVM with $\ell_2$ regularization with parameter $1$ (soft-margin SVM), and SVM with regularization parameter \texttt{float('inf')} (hard-margin SVM) on Mystery Dataset A (note that there is only a training dataset and no test dataset). Use \texttt{Logit} from \texttt{statsmodels} and \texttt{SVC} (with linear kernel) from \texttt{sklearn}.
One of these methods will not work -- give a mathematically rigorous explanation as to why this happens.
[Think carefully about the loss function used for this method. Look at the error message, and think about what happens in the case indicated.]
How could the associated problems be remedied?
Discuss similarities and differences between the solution obtained via these two working methods.
\ans{}
\item (3 pts) Take your solution for the soft-margin SVM from the previous part.
For each point in the dataset, take its inner product with the produced coefficient vector, and scale the result by the sign of each point's label (replace 0's with -1's). How many of these values are $\leq 1$? [Be sure you're getting all of them -- there may be numerical precision issues, so if in doubt, err on the side of counting a point.]
Based on your answer to these questions, sketch a 2D caricature of what the points and the hyperplane defined by the SVM solution look like.
(A ``caricature'' in this context means a rough sketch of what you think Mystery Dataset A looks like, using the information you have learned in this question and the previous part.
There will be a few key features of the dataset you can emphasize in your caricature.
This might use more information than the literal exact answers to the questions that have been asked (though not much more).
)
Write the parameter vector solution to the SVM problem as a linear combination of some points in your dataset.
How many points did you require?
[You may find built-in functions useful for this purpose.]
Compute the solution for the same three methods on Mystery Dataset B.
You will again run into issues with one of them -- explain why. [The answer is likely to be simpler than last time.]
Find a way to write the parameter vector solution to the SVM problem as a linear combination of some points in your dataset -- do not report the solution itself, but report how many points you used, and how you arrived at this answer.
Compare the empirical prediction accuracy (i.e., using 0-1 loss) of the successfully-trained classifiers on the test set.
\ans{}
\item (Optional -- 0 pts) Construct a dataset in which every point is a support vector.
Do this for the soft margin ($C = 1$) and hard margin ($C = \infty$) cases.
Make sure this dataset has $n \geq 100$ points.
Submit these datasets in the \texttt{data} folder, as well as code required to generate them and demonstrate the number of support vectors.
\ans{}
\end{enumerate}
\end{exercise}
\begin{exercise}[Support Vector Regression (8 pts)]
Let us consider support vector regression:
\begin{align}
\label{eq:CSVM}
\min_{\wv\in \RR^d,
b\in\RR} ~ \frac{1}{2} \|\wv\|_2^2 + C\sum_{i=1}^n \max\{ | y_i - (\wv^\top \xv_i + b)| -\varepsilon , 0 \},
\end{align}
where $\xv_i \in \RR^d$, $y_i \in \RR$, and $\|\wv\|_2 := \sqrt{\sum_{j=1}^d w_j^2}$ is the Euclidean norm.
The above expression is the loss function; the error is simply the latter term, $C\sum_{i=1}^n \max\{ | y_i - (\wv^\top \xv_i + b)| -\varepsilon , 0 \}$.
\begin{enumerate}
\item (2 pts) Derive the Lagrangian dual of the support vector regression loss function \eqref{eq:CSVM}. Please include intermediate steps so that you can get partial credit.
\ans{}
\vskip1cm
In the following you will complete and implement the following gradient algorithm for solving support vector regression in \Cref{eq:CSVM}:
\begin{algorithm}[H]
\DontPrintSemicolon
\KwIn{$X\in\RR^{n\times d}$, $\yv\in \RR^n$, $\wv=\zero_d$, $b=0$, $\mathsf{max\_pass} \in \mathds{N}$, step size $\eta$}
\KwOut{$\wv, b$}
\For{$t=1, 2, \ldots, \mathsf{max\_pass}$ }{
\For{$i=1, 2, \ldots, n$}{
choose step size $\eta$
\If{$|y_i - (\inner{\xv_i}{\wv}+b)| \geq \varepsilon$}{
$\wv \gets $ \tcp*{$\xv_i$ is the $i$-th row of $X$}
$b \gets $
}
$\wv \gets $ \tcp*{proximal step}
}
}
\caption{GD for SVR.}
\label{alg:CSVM}
\end{algorithm}
Note that this differs a bit from what you've seen so far, in terms of gradient descent. Rather than taking steps based on the entire loss function, we instead take a step based on the unregularized loss, and then perform a projection step based on the regularizer (sometimes called a ``proximal step'').
\item (2 pts) Compute the gradient with respect to $\wv$ and $b$ of the second term in \Cref{eq:CSVM}, shown below.
Note that in places where the function is non-differentiable, you might have to compute a sub-gradient.
\begin{align}
C\sum_{i=1}^n \max\{ | y_i - (\wv^\top \xv_i + b)| -\varepsilon , 0 \}
\end{align}
\ans{}
\item (1 pt) Find the closed-form solution of the following proximal step:
\begin{align}
\mathsf{P}^\eta(\wv) = \argmin_{\zv} ~ \frac{1}{2\eta} \|\zv - \wv\|_2^2 + \frac{1}{2} \|\zv\|_2^2
\end{align}
\ans{}
\item (3 pts) Implement \Cref{alg:CSVM}. You should use part 2 to complete lines 5--6, and part 3 for line 7.
Run it on Mystery Dataset C (this is Mystery Dataset A from Assignment 1, but reused), and report your training error, training loss, and test error. Use $C=1$ and $\varepsilon = 0.5$.
\ans{}
\end{enumerate}
\end{exercise}
\begin{exercise}[Kernels (5 pts)]
For the following questions you might find it useful to recall the definition of a matrix being positive semidefinite (PSD).
A matrix $M \in \mathbb{R}^{d \times d}$ is PSD if and only if $x^T Mx \geq 0$ for all vectors $x \in \mathbb{R}^d$.
It may also be helpful to refresh yourself on Taylor series.
\begin{enumerate}
\item (2 pts) For $x, y \in \mathbb{R}$, consider the kernel function $k(x, y) = \exp\left(-\alpha\left(x - y\right)^2\right)$.
What is the corresponding feature map $\phi(\cdot)$ such that $\phi(x)^T \phi(y) = k(x, y)$?
If you were using this kernel for an SVM model, would you prefer to solve the primal or dual representation? Why?
\ans{}
\item (1 pt) Consider the function $\frac{1}{1 - xy}$, where $x, y \in (-1, 1)$.
Is this function a valid kernel?
If so, write out the corresponding feature map $\phi(\cdot)$, if not, explain why.
\ans{}
\item (1 pt) Consider the function $\log (1 + xy)$, where $0 < x, y \in \mathbb{R}$.
Is this function a valid kernel?
If so, write out the corresponding feature map $\phi(\cdot)$, if not, explain why.
\ans{}
\item (1 pt) Consider the function $\cos(x + y)$, where $x, y \in \mathbb{R}$.
Is this function a valid kernel?
If so, write out the corresponding feature map $\phi(\cdot)$, if not, explain why.
\ans{}
\end{enumerate}
\end{exercise}
\begin{exercise}[Decision Trees (8 pts)]
In this exercise we will implement decision trees for binary classification.
Use the provided stub files.
Recall: decision trees are constructed by repeatedly splitting nodes.
We split a node by measuring the loss of splitting with respect to each possible feature and threshold, and split based on the feature and threshold that minimizes this loss.
Mathematically:
\[X_L = \{x\ :\ x \in X \wedge x[i] \leq j\},\]
\[X_R = \{x\ :\ x \in X \wedge x[i] > j\},\]
where $x[i]$ is the $i$th coordinate of point $x$.
The vector of labels $y$ is split into vectors $y_L$ and $y_R$ using the same indices.
The loss of splitting a training dataset into a left and right half is computed as
\[\ell(X,y,i,j) = \frac{|y_L|}{|y|}\ell(y_L) + \frac{|y_R|}{|y|}\ell(y_R) \]
We will consider the following loss functions $\ell$, specialized for binary classification.\footnote{Note that these are slightly different from what we discussed in class, since we are only focusing on binary classification. Additionally, there was some implicit rescaling which we omit here.}
Define $\hat p$ for a vector of labels $y$ to be $\frac{|\{y_j \in y : y_j = 1\}|}{|y|}$, that is, the fraction of labels which are $1$.
We have the following three loss functions.
\noindent Misclassification error:
\[\min \{\hat p, 1-\hat p\}\]
Gini coefficient:
\[\hat p (1 - \hat p)\]
Entropy:
\[- \hat p \log_2 (\hat p) - (1 - \hat p) \log_2 (1 - \hat p ) \]
We do not split a node if it is pure (i.e., consists entirely of either $0$'s or $1$'s), or if a split would exceed a maximum depth hyperparameter provided to the decision tree (recall that the depth of a single-node tree is $0$).
\begin{enumerate} [label=\alph*)]
\item (6 pts) Implement and train decision trees on the provided dataset D. Create a different plot for each of the three loss functions (misclassification error, Gini index, and entropy). The x-axis of each plot should show the maximum depth of the tree (starting from 0), and the y-axis should indicate the accuracy. Include two trend lines: one for the training accuracy and one for the test accuracy. Observe and comment on how the different loss functions perform, and how train and test accuracy change as a function of the maximum depth.
\item (2 pts) Implement and use Bagging on decision trees. Use the entropy loss, and create an ensemble of 101 decision trees, capping the maximum depth at 3. Repeat 11 times and report the median, minimum, and maximum test classification accuracy.
Repeat, but using the random forests method. In particular, at each split, consider only a random subset of $\sqrt{d}$ features when deciding which dimension to split on. Since $\sqrt{d} \approx 3.6$ for the given dataset, choose $4$ random features each time.
Report the classification accuracies as before.
Comment on performance between the two, as well as how they perform in comparison to the non-ensemble methods from the previous part.
\end{enumerate}
\end{exercise}
\end{document}