Commit 5a7c28bb authored by ak

adjustments to appendix and task splits

parent afce1bbd
@@ -230,21 +230,23 @@
% Balancing of Tasks/Classes
Each task $T_{i}$ contains all samples of the classes that define it; see \cref{tab:slts} for details.
It is assumed that data from all tasks occur with equal probability. Some datasets are unbalanced, for example Fruits and SVHN classes 1 and 2, which may render certain sub-task settings more difficult.
% Initial/Replay
Training consists of an (initial) run on $T_1$, followed by a sequence of independent (replay) runs on $T_{i}, i>1$.
% Averaged over runs & baseline experiments
We perform ten randomly initialized runs for each CIL-Problem, and conduct baseline experiments for all datasets to measure the offline joint-class training performance. We set the training mini-batch size to $\beta=100$ ($\beta=50$ for the Fruits dataset).
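% Schedule sketch
To make the protocol concrete, the following minimal Python sketch enumerates the task schedule for two of the investigated splits; the dictionary entries mirror \cref{tab:slts}, and all names are ours (illustrative only, not the experiment code).
\begin{verbatim}
# Minimal sketch of the class-incremental schedule (illustrative only).
# Each entry lists the classes contained in T_1, T_2, ...
TASK_SPLITS = {
    "D5-1^5A": [[0, 1, 2, 3, 4], [5], [6], [7], [8], [9]],
    "D2-2^4A": [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
}

def schedule(problem):
    """Yield (task index, class list): an initial task T_1, then replay tasks T_i, i > 1."""
    for i, classes in enumerate(TASK_SPLITS[problem], start=1):
        yield i, classes

for i, classes in schedule("D5-1^5A"):
    phase = "initial" if i == 1 else "replay"
    print(f"T{i} ({phase}): classes {classes}")
\end{verbatim}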
%
\begin{table}[h!]
\tiny
\renewcommand{\arraystretch}{.75}
\centering
\begin{tabular}{ c:c: c | c | c | c | c | c }
& task split & {$T_1$} & {$T_2$} & {$T_3$} & {$T_4$} & {$T_5$} & {$T_6$} \\[0.2ex]
\cdashline{1-8}
& & & & & & \\
\multirow[c]{4}{*}[0in]{\rotatebox{90}{CIL-P.}}
& \multicolumn{1}{:c:}{D2-$2^4$A} & [0,1] & [2,3] & [4,5] & [6,7] & [8,9] & / \\
& \multicolumn{1}{:c:}{D5-$1^5$A} & [0-4] & 5 & 6 & 7 & 8 & 9 \\
%\hline
% D6-$1^4$A & [0-5] & 6 & 7 & 8 & 9 & / \\
% D6-$1^4$B & [4-9] & 0 & 1 & 2 & 3 & / \\
@@ -258,13 +260,14 @@
task split & {$T_1$} & {$T_2$} & {$T_3$} & {$T_4$} & {$T_5$} & {$T_6$} \\[0.2ex]
\cdashline{1-7}
& & & & & & \\
D2-$2^4$B & [8,9] & [6,7] & [4,5] & [2,3] & [0,1] & / \\
D5-$1^5$B & [5-9] & 0 & 1 & 2 & 3 & 4 \\
%\hline
D7-$1^3$B & [3-9] & 0 & 1 & 2 & / & / \\
%\hline
D20-$1^5$B & [5-24] & 0 & 1 & 2 & 3 & 4 \\
\end{tabular}
\caption{Overview of the investigated CL/CIL-Problems. Each task $T_i$ contains image and label data pairs $(X,Y)$ from the corresponding classes.
D20-$1^5$A and D20-$1^5$B are exclusive to E-MNIST.
%Initial task $T_1$ data is balanced w.r.t classes.
\label{tab:slts}
@@ -608,30 +611,26 @@
\clearpage
\section{Deep Generative Replay training}\label{app:dgr}
\begin{table}
\setlength{\arrayrulewidth}{0.25mm}
\renewcommand{\arraystretch}{1.25}
%\setlength{\tabcolsep}{16pt}
\centering
\tiny
\begin{tabular}{ lc:lc:lc}
\textbf{Component} & \textbf{Layer} & \multicolumn{4}{|c}{...} \\
\hline
\hline
& & & & & \\
\textbf{Encoder} & C2D(32,5,2)-ReLU
& \textbf{Decoder} & Dense(100)-ReLU
& \textbf{Solver} & Flatten \\
& C2D(64,5,2)-ReLU & & Dense((H/4)*(W/4)*64)-ReLU & & Dense(400)-ReLU \\
& Flatten & & Reshape((H/4),(W/4),64)-ReLU & & Dense(400)-ReLU \\
& Dense(100)-ReLU & & C2DTr(32,5,2)-ReLU & & Dense(400)-ReLU \\
& Dense(25)-ReLU & & C2DTr(C,5,2)-Sig. & & Dense(10)-Softmax \\
& Dense(50) & & & & \\
\multicolumn{6}{c}{...} \\
\hline
\hline
%\cdashline{1-6}
& & & & & \\
\textbf{LR-Encoder} & Flatten
......@@ -642,7 +641,7 @@
& Dense(25)-ReLU & & Reshape(N,H,W,C) & & Dense(10)-Softmax \\
& Dense(50) & & & & \\
\end{tabular}
\caption{DNN architectures for VAE-based replay. A VAE generator consists of a mirrored encoder-decoder network. Components from the first row are used for MNIST, FashionMNIST, E-MNIST and Fruits-360, while the second-row components are deployed for latent replay (LR) on SVHN and CIFAR.
\label{tab:networkstructure}
}
\end{table}
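For concreteness, a possible realization of the first-row components in \cref{tab:networkstructure} is sketched below in TensorFlow/Keras. The padding mode and the interpretation of the final Dense(50) encoder layer as concatenated mean and log-variance are our assumptions and are not specified by the table.
\begin{verbatim}
# Sketch of the first-row components of the architecture table.
# Assumptions: "same" padding for all convolutions; Dense(50) = mean + log-variance.
import tensorflow as tf
from tensorflow.keras import layers

def build_encoder(H, W, C, latent_dim=25):
    return tf.keras.Sequential([
        tf.keras.Input(shape=(H, W, C)),
        layers.Conv2D(32, 5, strides=2, padding="same", activation="relu"),
        layers.Conv2D(64, 5, strides=2, padding="same", activation="relu"),
        layers.Flatten(),
        layers.Dense(100, activation="relu"),
        layers.Dense(25, activation="relu"),
        layers.Dense(2 * latent_dim),  # Dense(50): mean and log-variance
    ])

def build_decoder(H, W, C, latent_dim=25):
    return tf.keras.Sequential([
        tf.keras.Input(shape=(latent_dim,)),
        layers.Dense(100, activation="relu"),
        layers.Dense((H // 4) * (W // 4) * 64, activation="relu"),
        layers.Reshape((H // 4, W // 4, 64)),
        layers.Conv2DTranspose(32, 5, strides=2, padding="same", activation="relu"),
        layers.Conv2DTranspose(C, 5, strides=2, padding="same", activation="sigmoid"),
    ])

def build_solver(H, W, C, n_classes=10):
    return tf.keras.Sequential([
        tf.keras.Input(shape=(H, W, C)),
        layers.Flatten(),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="relu"),
        layers.Dense(n_classes, activation="softmax"),
    ])
\end{verbatim}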
@@ -653,16 +652,34 @@
% Training iterations, i.e., the amount of steps over the constituted mini-batches $\beta$, are calculated dynamically for each task. This affects the balanced mixing strategy, as $D_i$ grows linearly, affecting the training duration negatively.
%This is due to the fact that $\mathcal{D}_{T_{i>1}}$ is significantly smaller than $\mathcal{D}_{T_{1}}$ in a constant-setting.
%
The VAE latent dimension is 25 and the disentangling factor is $\beta=1.0$. Conditional sampling is turned off for MNIST, FashionMNIST, E-MNIST and Fruits-360, whereas it is turned on for SVHN and CIFAR so that the generator produces samples from previously seen classes in equal proportions. For the latter two datasets, we also operate on latent features and use fully-connected DNNs as encoder and decoder, see \cref{tab:networkstructure}.
%
The learning rates for the VAE generators and solvers are set to $\epsilon_G=10^{-4}$ and $\epsilon_S=10^{-3}$, using the ADAM optimizer with $\beta_{1}=0.9$, $\beta_{2}=0.999$. Generators and solvers are trained for 100 epochs each. We reinitialize the solver network for SVHN and CIFAR before each new task, as this has shown a stabilizing effect in our empirical studies. For MNIST, FashionMNIST, E-MNIST and Fruits-360, the same structures are maintained throughout the replay training.
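A single replay step can be summarized by the following condensed sketch. It assumes unconditional sampling with pseudo-labels from the previous solver (the setting used for the MNIST-type datasets above); the model wrappers are hypothetical, and matching the amount of generated data to the current task size is an illustrative simplification.
\begin{verbatim}
# Condensed sketch of one DGR replay step (hypothetical model wrappers).
import numpy as np

def dgr_replay_step(x_new, y_new, old_generator, old_solver,
                    new_generator, new_solver, latent_dim=25):
    n_replay = len(x_new)                            # illustrative: match current task size
    z = np.random.normal(size=(n_replay, latent_dim))
    x_replay = old_generator.decode(z)               # samples of previously seen classes
    y_replay = old_solver.predict(x_replay).argmax(axis=1)  # pseudo-labels

    x_mix = np.concatenate([x_new, x_replay])
    y_mix = np.concatenate([y_new, y_replay])

    new_generator.fit(x_mix, epochs=100)             # VAE training (unsupervised)
    new_solver.fit(x_mix, y_mix, epochs=100)         # solver training (supervised)
    return new_generator, new_solver
\end{verbatim}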
%
%-------------------------------------------------------------------------
\section{Experience Replay training}\label{app:er}
The solvers for ER are shown in \cref{tab:er_network}. The ADAM optimizer is used with a learning rate of $10^{-4}$, $\beta_{1}=0.9$, $\beta_{2}=0.999$, and the network is trained for 50 epochs on each task. Analogous to the procedure for DGR, replay for SVHN and CIFAR is performed on latent feature representations, see, e.g., \cite{pellegrini2020latent}, encoded by a pre-trained feature extractor as described in \cref{app:fm}.
Similar to \cite{riemer2018learning}, reservoir sampling is used to select $50$ samples of each encountered class for storage. For replay, the buffer is oversampled to obtain a number of samples equal to the number of data instances in the current task $T_i$.
Thus, we choose an ER implementation that has constant time complexity, although the number of distinct samples per task will decrease over time. At some point, CL will break down because there are too few distinct samples per task to protect previously acquired knowledge.
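The buffer management described above can be summarized by the following minimal sketch: class-wise reservoir sampling keeps at most 50 samples per class, and the replay set is drawn with replacement so that it matches the current task size. The helper names are ours.
\begin{verbatim}
# Minimal sketch of class-wise reservoir sampling and buffer oversampling for ER.
import random
from collections import defaultdict

BUFFER_PER_CLASS = 50

class ReplayBuffer:
    def __init__(self):
        self.store = defaultdict(list)   # class label -> stored samples
        self.seen = defaultdict(int)     # class label -> samples seen so far

    def add(self, x, y):
        """Reservoir sampling per class: every sample is kept with equal probability."""
        self.seen[y] += 1
        if len(self.store[y]) < BUFFER_PER_CLASS:
            self.store[y].append(x)
        else:
            j = random.randrange(self.seen[y])
            if j < BUFFER_PER_CLASS:
                self.store[y][j] = x

    def sample_for_replay(self, n_current):
        """Oversample the buffer so the replayed set matches the current task size."""
        pool = [(x, y) for y, xs in self.store.items() for x in xs]
        return [random.choice(pool) for _ in range(n_current)]
\end{verbatim}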
\begin{table}
\setlength{\arrayrulewidth}{0.25mm}
\renewcommand{\arraystretch}{1.25}
%\setlength{\tabcolsep}{16pt}
\centering
\tiny
\begin{tabular}{ l : l }
\textbf{ER-Solver} & \textbf{Layers} \\
\hline
& \\
& C2D(32,5,1)-ReLU $\rightarrow$ MP2D(2) \\
& C2D(64,5,1)-ReLU $\rightarrow$ MP2D(2) \\
& Flatten $\rightarrow$ Dense(100)-ReLU $\rightarrow$ Dense(10)-Softmax
\end{tabular}
\caption{DNN architecture for the ER solver used for the MNIST, FashionMNIST, E-MNIST and Fruits-360 datasets. For latent replay, the LR solver network shown in \cref{tab:networkstructure} (bottom-right) is used.
\label{tab:er_network}
}
\end{table}
% -----------------------------------------------------------------
\end{document}