\documentclass{beamer}
\usefonttheme{serif}              % less ambiguous math symbols
\usepackage{neuralnetwork}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{dsfont}               % for \mathds (double-struck)
\usepackage[cal=boondox]{mathalfa}% for the script "o"
\newcommand{\oo}{{\mathcal o}}
\newcommand{\phio}{\phi_{\oo}}
\newcommand{\por}{\,}             % multiplication

\usepackage{Sweave}
\begin{document}
\title{Artificial neurons}
\author{\textsc{manadine} -- Data Analysis 1}

\begin{frame} [fragile]
  \maketitle
\end{frame}
\section{Introduction}
\begin{frame} [fragile]
  \frametitle{Introduction}
  \begin{itemize}
  \item inspired by the connections (synapses)\\ between neurons in the brain
  \item they allow classification or regression
  \item they are nonlinear regression models with many parameters\\
    (black box: the fit is not constructive)
  \item they can fit unstructured information (text, photos...) but
    % \url{https://techcrunch.com/2018/01/02/these-psychedelic-stickers-blow-ai-minds/}
    sometimes hallucinate
  \item types
    \begin{itemize}
    \item perceptron
    \item radial basis
    \item self-organizing maps
    \item ...
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Outline}
  \begin{itemize}
  \item Classification
    \begin{itemize}
    \item Simple perceptron
    \item Multilayer perceptron (1 hidden layer)
    \end{itemize}
  \item Regression
  \item Weight decay
  \item Recommendations
  \end{itemize}
\end{frame}

\section{Classification}
\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextx}[2]{$\ifnum0=#2 \alpha \else x_{#2}\fi$}
    \newcommand{\nodetexty}[2]{$y_{#2}$}
    \inputlayer[count=4, bias=false, title=Input\\layer, text=\nodetextx]
    \outputlayer[count=3, title=Output\\layer, text=\nodetexty] \linklayers
  \end{neuralnetwork}
\end{frame}
\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextx}[2]{$\ifnum0=#2 \alpha \else x_{#2}\fi$}
    \newcommand{\nodetexty}[2]{$y_{#2}$}
    \inputlayer[count=4, bias=true, title=Input\\layer, text=\nodetextx]
    \outputlayer[count=3, title=Output\\layer, text=\nodetexty] \linklayers
  \end{neuralnetwork}
\end{frame}
%% \newcommand{\nodetextclear}[2]{}
%% \setdefaultnodetext{\nodetextclear}
\newcommand{\peso}[4]{\ensuremath{w_{#2k}}}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextx}[2]{$\ifnum0=#2 \alpha_k \else x_{#2}\fi$}
    \newcommand{\nodetexty}[2]{$y_{k}$}
    \inputlayer[count=4, bias=true, title=Input\\layer, text=\nodetextx]
    \outputlayer[count=1, title=Output\\layer, text=\nodetexty] %\linklayers
    \link[from layer=0, to layer=1, from node=0, to node=1]
    \link[from layer=0, to layer=1, from node=1, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=2, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=3, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=4, to node=1, label=\peso]
  \end{neuralnetwork}
\end{frame}
\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextx}[2]{$\ifnum0=#2 1 \else x_{#2}\fi$}
    \newcommand{\nodetexty}[2]{$y_{k}$}
    \newcommand{\pesa}[4]{\ensuremath{\alpha_{k}}}
    \inputlayer[count=4, bias=true, title=Input\\layer, text=\nodetextx]
    \outputlayer[count=1, title=Output\\layer, text=\nodetexty] %\linklayers
    \link[from layer=0, to layer=1, from node=0, to node=1, label=\pesa]
    \link[from layer=0, to layer=1, from node=1, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=2, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=3, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=4, to node=1, label=\peso]
  \end{neuralnetwork}
\end{frame}
\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextx}[2]{$x_{#2}$}
    \newcommand{\nodetexty}[2]{$y_{k}$}
    \inputlayer[count=4, bias=true, title=Input\\layer, text=\nodetextx]
    \outputlayer[count=1, title=Output\\layer, text=\nodetexty] %\linklayers
    \link[from layer=0, to layer=1, from node=0, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=1, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=2, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=3, to node=1, label=\peso]
    \link[from layer=0, to layer=1, from node=4, to node=1, label=\peso]
  \end{neuralnetwork}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{itemize}
  \item \(\mathcal i\) = input
  \item \(\oo\) = output
  \item \(\phi\) = activation function
  \item \(\alpha\) = constant, bias
  \item \(w\) = weight, synaptic coefficient
    \[ y_k = \phio \left( \alpha_k + \sum_{i=1}^I w_{ik}\por x_i \right) = 
    \phio \left( \sum_{i=0}^I w_{ik}\por x_i \right) \]
  \end{itemize}
\end{frame}
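
\begin{frame}[fragile]
  \frametitle{Simple perceptron}
  The formula on the previous slide as a one-line R sketch (the
  numbers are illustrative; \(\phio\) taken as the logistic
  function, \texttt{plogis} in R):
\begin{Schunk}
\begin{Sinput}
> x <- c (1, 0.2, -0.5)  # x_0 = 1 carries the bias
> w <- c (0.1, 2.0, 1.5) # alpha_k, w_1k, w_2k
> plogis (sum (w * x))   # y_k
\end{Sinput}
\end{Schunk}
\end{frame}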

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
  \begin{itemize}
  \item common activation functions \(\phi\)
    (sketched in R after this frame)
    \begin{itemize}
    \item linear \[\phio(x) = x\]
    \item logistic
      \[ \phio (x)=\ell(x) = \frac {\exp(x)} {1+\exp(x)} \]
    \item indicator, threshold, characteristic, Heaviside
      \[ \phio(x)= \mathds1_{[0,\infty)}(x) =
      \begin{cases}
        0&\text{if }x<0\\
        1&\text{if }x\geqslant0
      \end{cases}
      \end{cases}
      \]
    \end{itemize}
  \end{itemize}
\end{frame}
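
\begin{frame}[fragile]
  \frametitle{Simple perceptron}
  The three activation functions as a minimal R sketch (the names
  are illustrative; \texttt{plogis} is the logistic c.d.f., so no
  explicit \texttt{exp} is needed):
\begin{Schunk}
\begin{Sinput}
> phi.linear    <- function (x) x
> phi.logistic  <- function (x) plogis (x) # exp(x)/(1+exp(x))
> phi.heaviside <- function (x) as.numeric (x >= 0)
> curve (phi.logistic, -5, 5)  # S-shaped curve through (0, 0.5)
\end{Sinput}
\end{Schunk}
\end{frame}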

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\includegraphics{neuronas-woven-001}
\end{frame}
\begin{frame} [fragile]
  \frametitle{Simple perceptron: fitting}
  \begin{itemize}
  \item activation function:
    \begin{itemize}
    \item for a dichotomous response, logistic: \( \phi_\oo(x)= \frac {\exp(x)} {1+\exp(x)} \)
    \item for three or more categories, linear: \(\phi_\oo(x)=x\)
    \end{itemize}
  \item fitting criteria \\
    \(p=\text{pattern}\quad t=\text{response}\in\{0,1\}
    \quad y=\text{prediction}\)
    \begin{itemize}
    \item for a dichotomous response, entropy
      (\(0\leqslant y\leqslant1\))
      \[E=\sum_p\sum_k\left[
        {t^{(p)}_k}\por\ln\frac{t^{(p)}_k}{y^{(p)}_k} +(1-t^{(p)}_k)\por\ln\frac{1-t^{(p)}_k}{1-y^{(p)}_k}\right]\]
    \item for a multiple response, \emph{softmax} (\(y\in\mathds R\))
      \[ E = \sum_p\sum_k-t^{(p)}_k\por\log {\widehat\Pr}[p\in k]\qquad
        {\widehat\Pr}[p\in k]=\frac{\exp\bigl({y^{(p)}_k}\bigr)}{\sum_{c=1}^K \exp\bigl({y^{(p)}_c}\bigr)} \]
    \end{itemize}
  \end{itemize}
\end{frame}
\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> ## to save space in this presentation:
> options (width = 58)  
> names(iris)[1:4] <- c("Lsep","Asep","Lpet","Apet")
> library (nnet)   # package distributed with base R
> red <- nnet (Species ~ ., iris, size = 0, skip = TRUE)
\end{Sinput}
\begin{Soutput}
# weights:  15
initial  value 285.325363 
iter  10 value 8.096373
iter  20 value 5.960549
iter  30 value 5.952579
iter  40 value 5.949318
iter  50 value 5.949290
final  value 5.949277 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> red
\end{Sinput}
\begin{Soutput}
a 4-0-3 network with 15 weights
inputs: Lsep Asep Lpet Apet 
output(s): Species 
options were - skip-layer connections  softmax modelling 
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> summary (red)
\end{Sinput}
\begin{Soutput}
a 4-0-3 network with 15 weights
options were - skip-layer connections  softmax modelling 
 b->o1 i1->o1 i2->o1 i3->o1 i4->o1 
 -2.82   1.96  17.93 -14.07 -11.96 
 b->o2 i1->o2 i2->o2 i3->o2 i4->o2 
 22.60   0.21  -5.82   2.29  -2.75 
 b->o3 i1->o3 i2->o3 i3->o3 i4->o3 
-20.08  -2.26 -12.51  11.73  15.55 
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> names (red)
\end{Sinput}
\begin{Soutput}
 [1] "n"             "nunits"        "nconn"        
 [4] "conn"          "nsunits"       "decay"        
 [7] "entropy"       "softmax"       "censored"     
[10] "value"         "wts"           "convergence"  
[13] "fitted.values" "residuals"     "lev"          
[16] "call"          "terms"         "coefnames"    
[19] "xlevels"      
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> red $ wts
\end{Sinput}
\begin{Soutput}
 [1]  -2.8239735   1.9633937  17.9281992 -14.0713310
 [5] -11.9612859  22.6031112   0.2066642  -5.8225870
 [9]   2.2915637  -2.7476730 -20.0809056  -2.2580146
[13] -12.5066021  11.7273984  15.5503582
\end{Soutput}
\begin{Sinput}
> head (red $ fitted.values)
\end{Sinput}
\begin{Soutput}
  setosa   versicolor    virginica
1      1 6.295937e-19 9.314251e-46
2      1 1.285481e-13 8.803349e-39
3      1 3.076484e-16 3.526606e-42
4      1 1.040148e-13 1.964745e-38
5      1 6.980244e-20 6.771966e-47
6      1 2.379159e-20 7.638111e-46
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> head (red $ fitted.values)
\end{Sinput}
\begin{Soutput}
  setosa   versicolor    virginica
1      1 6.295937e-19 9.314251e-46
2      1 1.285481e-13 8.803349e-39
3      1 3.076484e-16 3.526606e-42
4      1 1.040148e-13 1.964745e-38
5      1 6.980244e-20 6.771966e-47
6      1 2.379159e-20 7.638111e-46
\end{Soutput}
\begin{Sinput}
> summary (apply (red $ fitted.values, 1, sum))
\end{Sinput}
\begin{Soutput}
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> red $ value
\end{Sinput}
\begin{Soutput}
[1] 5.949277
\end{Soutput}
\begin{Sinput}
> indices.fila    <- 1 : nrow (iris)
> indices.columna <- match (iris$Species, 
+                           levels(iris$Species))
> indices <- cbind (indices.fila, indices.columna)
> - sum (log (red$fitted.values [indices]))
\end{Sinput}
\begin{Soutput}
[1] 5.949277
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> aggregate (iris[,1:4], list(iris$Species), median)
\end{Sinput}
\begin{Soutput}
     Group.1 Lsep Asep Lpet Apet
1     setosa  5.0  3.4 1.50  0.2
2 versicolor  5.9  2.8 4.35  1.3
3  virginica  6.5  3.0 5.55  2.0
\end{Soutput}
\begin{Sinput}
> flor <- data.frame(Lsep=6,Asep=2.9,Lpet=5,Apet=1.7)
> predict (red, flor)
\end{Sinput}
\begin{Soutput}
        setosa versicolor virginica
1 2.509364e-20  0.1930841 0.8069159
\end{Soutput}
\begin{Sinput}
> predict (red, flor, type="class")
\end{Sinput}
\begin{Soutput}
[1] "virginica"
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> predict (red, flor)
\end{Sinput}
\begin{Soutput}
        setosa versicolor virginica
1 2.509364e-20  0.1930841 0.8069159
\end{Soutput}
\begin{Sinput}
> summary (red)
\end{Sinput}
\begin{Soutput}
a 4-0-3 network with 15 weights
options were - skip-layer connections  softmax modelling 
 b->o1 i1->o1 i2->o1 i3->o1 i4->o1 
 -2.82   1.96  17.93 -14.07 -11.96 
 b->o2 i1->o2 i2->o2 i3->o2 i4->o2 
 22.60   0.21  -5.82   2.29  -2.75 
 b->o3 i1->o3 i2->o3 i3->o3 i4->o3 
-20.08  -2.26 -12.51  11.73  15.55 
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Simple perceptron}
\begin{Schunk}
\begin{Sinput}
> predict (red, flor)
\end{Sinput}
\begin{Soutput}
        setosa versicolor virginica
1 2.509364e-20  0.1930841 0.8069159
\end{Soutput}
\begin{Sinput}
> flor1 <- c (1, as.numeric (flor))
> e1 <- exp (as.numeric (flor1 %*% red$wts[1:5]))
> e2 <- exp (as.numeric (flor1 %*% red$wts[6:10]))
> e3 <- exp (as.numeric (flor1 %*% red$wts[11:15]))
> c(e1,e2,e3) / (e1+e2+e3)
\end{Sinput}
\begin{Soutput}
[1] 2.509364e-20 1.930841e-01 8.069159e-01
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Dichotomous response}
\begin{Schunk}
\begin{Sinput}
> red2 <- nnet (factor(am)~mpg, mtcars, 
+               size=0, skip=TRUE, trace=FALSE)
> predict (red2, data.frame (mpg = 20))
\end{Sinput}
\begin{Soutput}
       [,1]
1 0.3862832
\end{Soutput}
\begin{Sinput}
> 1 / (1 + 1/exp (red2$wts[1] + red2$wts[2] * 20)) #inverse logit
\end{Sinput}
\begin{Soutput}
[1] 0.3862832
\end{Soutput}
\begin{Sinput}
> red2 $ value
\end{Sinput}
\begin{Soutput}
[1] 14.83758
\end{Soutput}
\begin{Sinput}
> p <- red2 $ fitted.values             #a single column
> - sum (p * log(p) + (1-p) * log(1-p)) #entropy
\end{Sinput}
\begin{Soutput}
[1] 14.83758
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Dichotomous response}
\begin{Schunk}
\begin{Sinput}
> p <- red2 $ fitted.values             #a single column
> - sum (p * log(p) + (1-p) * log(1-p)) #entropy
\end{Sinput}
\begin{Soutput}
[1] 14.83758
\end{Soutput}
\begin{Sinput}
> ## equivalent to
> t <- +(mtcars$am == mtcars$am[1])
> n0 <- function (x) ifelse (is.na(x), 0, x)
> sum (n0(t*log(t/p)) + n0((1-t)*log((1-t)/(1-p))))
\end{Sinput}
\begin{Soutput}
[1] 14.83758
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextclear}[2]{}
    \newcommand{\nodetextx}[2]{$x_#2$}
    \newcommand{\nodetexty}[2]{$y_#2$}
    \inputlayer[count=4, bias=false, title=Input, text=\nodetextx]
    \hiddenlayer[count=2, bias=false, title=Hidden\\layer, text=\nodetextclear] \linklayers
    \outputlayer[count=3, title=Output, text=\nodetexty] \linklayers
    \renewcommand{\peso}[4]{\ensuremath{w_{ih}}}
    \link[from layer=0, to layer=1, from node=1, to node=1, label=\peso]
    \renewcommand{\peso}[4]{\ensuremath{w_{hk}}}
    \link[from layer=1, to layer=2, from node=1, to node=1, label=\peso]
  \end{neuralnetwork}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextclear}[2]{}
    \newcommand{\nodetextx}[2]{$x_#2$}
    \newcommand{\nodetexty}[2]{$y_#2$}
    \inputlayer[count=4, bias=true, title=Input, text=\nodetextx]
    \hiddenlayer[count=2, bias=true, title=Hidden, text=\nodetextclear] \linklayers
    \outputlayer[count=3, title=Output, text=\nodetexty] \linklayers
    \renewcommand{\peso}[4]{\ensuremath{\alpha_{h}}}
    \link[from layer=0, to layer=1, from node=0, to node=1, label=\peso]
    \renewcommand{\peso}[4]{\ensuremath{\alpha_{k}}}
    \link[from layer=1, to layer=2, from node=0, to node=1, label=\peso]
  \end{neuralnetwork}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
  \begin{itemize}
  \item  \(h\) = index over hidden-layer neurons
    \[ y_k = \phi_{\oo} \left( \alpha_k + \sum_{h=1}^H w_{hk}\por \phi_h \Biggl( \alpha_h + \sum_{i=1}^I w_{ih}\por x_i  \Biggr) \right) \]
    \[ y_k = \phi_{\oo} \left( \sum_{h=0}^H w_{hk} \por\phi_h \Biggl( \sum_{i=0}^I w_{ih}\por x_i  \Biggr) \right) \]
  \item \(\phi\) is almost always logistic in the hidden layer
    \[ \phi_h(x)=\ell(x) = \frac {\exp(x)} {1+\exp(x)} \]
  \end{itemize}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer, \texttt{skip=FALSE})}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextclear}[2]{}
    \newcommand{\nodetextx}[2]{$x_#2$}
    \newcommand{\nodetexty}[2]{$y_k$}
    \renewcommand{\peso}[4]{\ensuremath{w_{#2k}}}
    \inputlayer[count=2, bias=true, title=Input, text=\nodetextx]
    \hiddenlayer[count=3, bias=true, title=Hidden, text=\nodetextclear, exclude={1,2}]
    \outputlayer[count=1, title=Output, text=\nodetexty] 
    \link[from layer=0, to layer=1, from node=0, to node=3]
    \link[from layer=0, to layer=1, from node=1, to node=3]
    \link[from layer=0, to layer=1, from node=2, to node=3]
    \link[from layer=1, to layer=2, from node=0, to node=1]
    \link[from layer=1, to layer=2, from node=3, to node=1]
  \end{neuralnetwork}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer, \texttt{skip=TRUE})}
  \begin{neuralnetwork}[height=5]
    \newcommand{\nodetextclear}[2]{}
    \newcommand{\nodetextx}[2]{$x_#2$}
    \newcommand{\nodetexty}[2]{$y_k$}
    \renewcommand{\peso}[4]{\ensuremath{w_{#2k}}}
    \inputlayer[count=2, bias=true, title=Input, text=\nodetextx]
    \hiddenlayer[count=3, bias=true, title=Hidden, text=\nodetextclear, exclude={1,2}]
    \outputlayer[count=1, title=Output, text=\nodetexty] 
    \link[from layer=0, to layer=1, from node=0, to node=3]
    \link[from layer=0, to layer=1, from node=1, to node=3]
    \link[from layer=0, to layer=1, from node=2, to node=3]
    \link[from layer=1, to layer=2, from node=0, to node=1]
    \link[from layer=1, to layer=2, from node=3, to node=1]
%    \link[from layer=0, to layer=2, from node=0, to node=1, label=\peso]
    \link[from layer=0, to layer=2, from node=1, to node=1, label=\peso]
    \link[from layer=0, to layer=2, from node=2, to node=1, label=\peso]
  \end{neuralnetwork}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
  \begin{itemize}
  \item  \texttt{skip=FALSE}
    \[ y_k = \phi_{\oo} \left( \sum_{h} w_{hk} \por\phi_h \Biggl( \sum_{i} w_{ih}\por x_i  \Biggr)  \right) \] 
  \item  \texttt{skip=TRUE}
    \[ y_k = \phi_{\oo} \left( \sum_{h} w_{hk} \por\phi_h \Biggl( \sum_{i} w_{ih}\por x_i  \Biggr)  + \sum_i w_{ik}\por x_i \right) \] 
  \item \emph{skip} connections can make\\the neural network easier to interpret
  \end{itemize}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
\begin{Schunk}
\begin{Sinput}
> red0 <- nnet (Species ~ ., iris, size = 2,
+               skip = FALSE, trace = FALSE)
> red0
\end{Sinput}
\begin{Soutput}
a 4-2-3 network with 19 weights
inputs: Lsep Asep Lpet Apet 
output(s): Species 
options were - softmax modelling 
\end{Soutput}
\begin{Sinput}
> red1 <- nnet (Species ~ ., iris, size = 2,
+               skip = TRUE, trace = FALSE)
> red1
\end{Sinput}
\begin{Soutput}
a 4-2-3 network with 31 weights
inputs: Lsep Asep Lpet Apet 
output(s): Species 
options were - skip-layer connections  softmax modelling 
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
\begin{Schunk}
\begin{Sinput}
> summary (red0)
\end{Sinput}
\begin{Soutput}
a 4-2-3 network with 19 weights
options were - softmax modelling 
  b->h1  i1->h1  i2->h1  i3->h1  i4->h1 
   1.03    0.13    0.34   -0.50   -0.92 
  b->h2  i1->h2  i2->h2  i3->h2  i4->h2 
  -1.62   -7.56   -3.33   -5.56   -1.92 
  b->o1  h1->o1  h2->o1 
 -48.26  104.01   -1.35 
  b->o2  h1->o2  h2->o2 
  11.49    1.00    0.28 
  b->o3  h1->o3  h2->o3 
  37.11 -105.44    0.74 
\end{Soutput}
\end{Schunk}
\end{frame}
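
\begin{frame}[fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
  A sketch of the forward pass done by hand, assuming the weight
  ordering that \texttt{summary} prints (\texttt{b->h1},
  \texttt{i1->h1}, \dots); the result should match
  \texttt{predict(red0, flor)}:
\begin{Schunk}
\begin{Sinput}
> x <- c (1, as.numeric (flor))           # bias + inputs
> W <- matrix (red0$wts[1:10], ncol = 2)  # input -> hidden
> h <- c (1, plogis (x %*% W))            # bias + logistic units
> V <- matrix (red0$wts[11:19], ncol = 3) # hidden -> output
> z <- as.numeric (h %*% V)
> exp (z) / sum (exp (z))                 # softmax
\end{Sinput}
\end{Schunk}
\end{frame}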

\begin{frame} [fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}
\begin{Schunk}
\begin{Sinput}
> summary (red1)
\end{Sinput}
\begin{Soutput}
a 4-2-3 network with 31 weights
options were - skip-layer connections  softmax modelling 
 b->h1 i1->h1 i2->h1 i3->h1 i4->h1 
 -4.93 -29.72 -17.20 -14.72  -4.16 
 b->h2 i1->h2 i2->h2 i3->h2 i4->h2 
  1.31   0.32   4.72  -8.49  -4.16 
 b->o1 h1->o1 h2->o1 i1->o1 i2->o1 i3->o1 i4->o1 
  3.02  -5.74   8.73   3.14   4.90  -8.63  -8.23 
 b->o2 h1->o2 h2->o2 i1->o2 i2->o2 i3->o2 i4->o2 
 20.05  -4.76 -21.75  -0.42   0.69  -1.00  -4.73 
 b->o3 h1->o3 h2->o3 i1->o3 i2->o3 i3->o3 i4->o3 
-22.57   9.81  13.76  -2.89  -5.99   8.43  13.55 
\end{Soutput}
\end{Schunk}
\end{frame}
\section{Regression}
\begin{frame} [fragile]
  \frametitle{Regression}
  \begin{itemize}
  \item linear output function: \(\phi_\oo(x)=x\)
  \item theorem (universal approximation)
    \begin{itemize}
    \item let \(f\) be any continuous function on a compact set
    \item \(f\) can be approximated uniformly
    \item it suffices to increase the number of neurons in the hidden layer
    \end{itemize}
  \item the approximation is ``non-constructive''
  \item fitting criteria (\(p\)=pattern, \(t\)=target, \(y\)=prediction)
    \begin{itemize}
    \item least squares: \(E = \sum_p\Vert t^{(p)}-y^{(p)}\Vert^2\)
    \end{itemize}
  \end{itemize}

\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> reg <- lm (mpg ~ wt, mtcars)
> print (summary (reg))
\end{Sinput}
\begin{Soutput}
[...]

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  37.2851     1.8776  19.858  < 2e-16
wt           -5.3445     0.5591  -9.559 1.29e-10

Residual standard error: 3.046 on 30 degrees of freedom
Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446 
F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-019}
\end{frame}

\begin{frame} [fragile]
\frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # to reproduce a bad example
> red <- nnet (mpg ~ wt, mtcars, size = 2, linout = TRUE)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 13690.144505 
iter  10 value 1126.051149
final  value 1126.047233 
converged
\end{Soutput}
\begin{Sinput}
> summary (red)
\end{Sinput}
\begin{Soutput}
a 1-2-1 network with 7 weights
options were - linear output units 
 b->h1 i1->h1 
  3.94   7.04 
 b->h2 i1->h2 
  5.78   9.94 
 b->o h1->o h2->o 
 7.57  4.78  7.75 
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> red $ value
\end{Sinput}
\begin{Soutput}
[1] 1126.047
\end{Soutput}
\begin{Sinput}
> sum (red $ residuals ^ 2)
\end{Sinput}
\begin{Soutput}
[1] 1126.047
\end{Soutput}
\begin{Sinput}
> sum (reg $ residuals ^ 2)
\end{Sinput}
\begin{Soutput}
[1] 278.3219
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-022}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(2) # much better after changing the seed
> red <- nnet (mpg ~ wt, mtcars, size = 2, linout = TRUE)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 12943.660208 
iter  10 value 962.385509
iter  20 value 251.143680
iter  30 value 202.430605
iter  40 value 202.330422
final  value 202.291329 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-024}
\end{frame}


\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # bad case; wider initial weight range
> red <- nnet (mpg ~ wt, mtcars, size = 2, linout = TRUE,
+              rang = 5)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 12211.292513 
iter  10 value 744.853983
iter  20 value 205.109500
iter  30 value 202.359319
final  value 202.291137 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-026}
\end{frame}


\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # bad case; standardized data; overfitting
> red <- nnet (mpg ~ wt, scale(mtcars), size = 2,
+              linout = TRUE)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 34.908961 
iter  10 value 6.037538
iter  20 value 5.526884
iter  30 value 5.265161
iter  40 value 4.800987
iter  50 value 4.798314
iter  60 value 4.792750
iter  70 value 4.792613
iter  80 value 4.792603
final  value 4.792594 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-028}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay to avoid overfitting
> red <- nnet (mpg ~ wt, scale(mtcars), size = 2,
+              linout = TRUE, decay=.001)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 34.910311 
iter  10 value 6.087401
iter  20 value 5.574842
iter  30 value 5.178789
iter  40 value 4.927741
iter  50 value 4.924397
iter  60 value 4.924349
iter  60 value 4.924349
iter  60 value 4.924349
final  value 4.924349 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-030}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay to avoid overfitting
> red <- nnet (mpg ~ wt, scale(mtcars), size = 2,
+              linout = TRUE, decay=.01)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 34.922460 
iter  10 value 6.544019
iter  20 value 5.954441
iter  30 value 5.895883
iter  40 value 5.893031
iter  50 value 5.791990
iter  60 value 5.779464
iter  60 value 5.779464
iter  60 value 5.779464
final  value 5.779464 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-032}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay to avoid overfitting
> red <- nnet (mpg ~ wt, scale(mtcars), size = 2,
+              linout = TRUE, decay=.1)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 35.043951 
iter  10 value 8.545438
iter  20 value 7.131476
iter  30 value 7.103682
final  value 7.103680 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-034}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # many hidden neurons; overfitting
> red <- nnet (mpg~wt, mtcars, size = 100, linout = TRUE)
\end{Sinput}
\begin{Soutput}
# weights:  301
initial  value 21248.821010 
iter  10 value 229.802676
iter  20 value 203.348311
iter  30 value 200.484030
iter  40 value 191.191581
iter  50 value 169.455915
iter  60 value 144.765317
iter  70 value 134.589994
iter  80 value 116.245707
iter  90 value 101.921759
iter 100 value 91.396961
final  value 91.396961 
stopped after 100 iterations
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-036}
\end{frame}


\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # rang does not prevent overfitting
> red <- nnet (mpg~wt, mtcars, size = 100, linout = TRUE,
+              rang=5)
\end{Sinput}
\begin{Soutput}
# weights:  301
initial  value 113453.230339 
iter  10 value 268.817455
iter  20 value 204.853381
iter  30 value 189.632583
iter  40 value 180.592357
iter  50 value 155.232946
iter  60 value 146.266433
iter  70 value 143.899068
iter  80 value 141.962872
iter  90 value 140.209864
iter 100 value 137.646885
final  value 137.646885 
stopped after 100 iterations
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-038}
\end{frame}


\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay prevents overfitting
> red <- nnet (mpg~wt, mtcars, size = 100, linout = TRUE,
+              decay=.1)
\end{Sinput}
\begin{Soutput}
# weights:  301
initial  value 21253.252588 
iter  10 value 289.968862
iter  20 value 238.766078
iter  30 value 226.206939
iter  40 value 219.981624
iter  50 value 218.469071
iter  60 value 217.911102
iter  70 value 217.510822
iter  80 value 216.974718
iter  90 value 216.531471
iter 100 value 216.264122
final  value 216.264122 
stopped after 100 iterations
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-040}
\end{frame}

% \begin{frame} [fragile]
%   \frametitle{Regresión}
% <<>>=
% set.seed(1)
% red <- nnet (mpg ~ wt, mtcars, size = 100, linout = TRUE,
%              trace = FALSE, maxit = 1000)
% red $ value
% set.seed(1)
% red <- nnet (mpg ~ wt, mtcars, size = 100, linout = TRUE,
%              trace = FALSE, maxit = 1e6)
% red $ value
% @ 
% \end{frame}

% \begin{frame}[fragile]
%   \frametitle{}
% <<fig=TRUE,echo=FALSE>>=
% grafpred (red, mtcars)
% @ 
% \end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay does not always work
> red <- nnet (mpg ~ wt, mtcars, size = 2, linout = TRUE,
+              decay = 0.001)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 13690.145855 
iter  10 value 1126.294236
final  value 1126.291748 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-042}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay does not always work
> red <- nnet (mpg ~ wt, mtcars, size = 2, linout = TRUE,
+              decay = 0.01)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 13690.158004 
iter  10 value 1128.325971
iter  20 value 1128.069333
final  value 1127.956753 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-044}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Regression}
\begin{Schunk}
\begin{Sinput}
> set.seed(1) # decay does not always work
> red <- nnet (mpg ~ wt, mtcars, size = 2, linout = TRUE,
+              decay = 0.1)
\end{Sinput}
\begin{Soutput}
# weights:  7
initial  value 13690.279495 
iter  10 value 1145.255981
final  value 1143.101582 
converged
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\includegraphics{neuronas-woven-046}
\end{frame}

\begin{frame} [fragile]
  \frametitle{Parameters}
  \begin{itemize}
  \item \texttt{maxit}: number of iterations
  \item \texttt{rang}: initial weights drawn from
    \(\mathcal U(-\text{rang}, +\text{rang})\)
  \item \texttt{decay}: weight decay
      \begin{itemize}\addtolength{\itemsep}{1ex}
  \item tries to avoid local optima when fitting the \(w_{ij}\)
  \item fits \(E+\lambda\por\sum_{i,j}w_{ij}^2\) instead of \(E\)
    (checked in the sketch after this frame)
  \item makes sense when \( 0 \lessapprox x, y \lessapprox 1\)
  \item recommended: \( 0.001 \lessapprox \lambda \lessapprox 0.1\)
  \end{itemize}
\end{itemize}
\end{frame}
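
\begin{frame}[fragile]
  \frametitle{Parameters}
  A sketch of what \texttt{decay} adds to the criterion, using the
  last fit above (\texttt{linout=TRUE}, \texttt{decay=0.1}): the
  reported value should equal the sum of squared residuals plus the
  penalty term:
\begin{Schunk}
\begin{Sinput}
> red $ value                  # penalized criterion
> sum (red$residuals^2) + 0.1 * sum (red$wts^2)
\end{Sinput}
\end{Schunk}
\end{frame}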

\begin{frame} [fragile]
  \frametitle{Recommendations}
  \begin{itemize}
  \item standardize / normalize / rescale the variables
  \item run several fits (try different seeds;\\see the sketch after this frame)
  \item cross-validation
  \end{itemize}
\end{frame}
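
\begin{frame}[fragile]
  \frametitle{Recommendations}
  A minimal sketch of the ``several seeds'' advice: refit a few
  times and keep the fit with the lowest criterion (10 runs and the
  model are illustrative choices):
\begin{Schunk}
\begin{Sinput}
> fits <- lapply (1:10, function (s) {
+     set.seed (s)
+     nnet (mpg ~ wt, scale(mtcars), size = 2,
+           linout = TRUE, trace = FALSE)
+ })
> best <- fits [[which.min (sapply (fits,
+                 function (f) f$value))]]
\end{Sinput}
\end{Schunk}
\end{frame}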

\begin{frame} [fragile]
  \frametitle{}
\begin{Schunk}
\begin{Sinput}
> valcruz <- function (numneur, decai, partes=10)
+ {
+     iparte <- sample (rep (1:partes,
+                            length.out = nrow(mtcars)))
+     mean (sapply (1:partes,
+             function (i)
+             {
+                 red <- nnet (mpg ~ wt, 
+                              mtcars[iparte!=i,],
+                              linout = TRUE,
+                              size = numneur, 
+                              decay = decai,
+                              trace = FALSE)
+                 mean ((predict (red, 
+                                 mtcars[iparte==i,]) -
+                        mtcars$mpg[iparte==i]) ^ 2)
+             }))
+ }
\end{Sinput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{}
\begin{Schunk}
\begin{Sinput}
> valcruz (  2, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 13.09448
\end{Soutput}
\begin{Sinput}
> valcruz (  2, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 28.91391
\end{Soutput}
\begin{Sinput}
> valcruz ( 10, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 11.78668
\end{Soutput}
\begin{Sinput}
> valcruz ( 10, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 11.84809
\end{Soutput}
\begin{Sinput}
> valcruz (100, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 11.53392
\end{Soutput}
\begin{Sinput}
> valcruz (100, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 11.5865
\end{Soutput}
\begin{Sinput}
> mean (reg $ residuals ^ 2)
\end{Sinput}
\begin{Soutput}
[1] 8.697561
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{}
\begin{Schunk}
\begin{Sinput}
> valcruzreg <- function (partes=10)
+ {
+     iparte <- sample (rep (1:partes,
+                            length.out = nrow(mtcars)))
+     mean (sapply (1:partes,
+             function (i)
+             {
+                 reg <- lm (mpg ~ wt, 
+                            mtcars[iparte!=i,])
+                 mean ((predict (reg, 
+                                 mtcars[iparte==i,]) -
+                        mtcars$mpg[iparte==i]) ^ 2)
+             }))
+ }
\end{Sinput}
\end{Schunk}
\end{frame}
\begin{frame} [fragile]
  \frametitle{}
\begin{Schunk}
\begin{Sinput}
> valcruzreg ()
\end{Sinput}
\begin{Soutput}
[1] 9.345148
\end{Soutput}
\begin{Sinput}
> mean (reg $ residuals ^ 2)
\end{Sinput}
\begin{Soutput}
[1] 8.697561
\end{Soutput}
\end{Schunk}
\end{frame}

\begin{frame} [fragile]
  \frametitle{}
\begin{Schunk}
\begin{Sinput}
> valcruz01 <- function (numneur, decai, partes=10)
+ {
+     iparte <- sample (rep (1:partes,
+                            length.out=nrow(mtcars01)))
+     mean (sapply (1:partes,
+             function (i)
+             {
+                 red <- nnet (mpg ~ wt, 
+                              mtcars01[iparte!=i,],
+                              linout = TRUE,
+                              size = numneur, 
+                              decay = decai,
+                              trace = FALSE)
+                 mean ((predict (red, 
+                                 mtcars01[iparte==i,])
+                        - mtcars01$mpg[iparte==i]) ^ 2)
+             }))
+ }
\end{Sinput}
\end{Schunk}
\end{frame}

\begin{frame}[fragile]
  \frametitle{}
\begin{Schunk}
\begin{Sinput}
> mtcars01 <- data.frame (scale (mtcars))
> mean (lm(mpg~wt,mtcars01) $ residuals ^ 2)
\end{Sinput}
\begin{Soutput}
[1] 0.2394432
\end{Soutput}
\begin{Sinput}
> valcruz01 (2, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 0.2467679
\end{Soutput}
\begin{Sinput}
> valcruz01 (2, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 0.2425711
\end{Soutput}
\begin{Sinput}
> valcruz01 (2, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 0.3180561
\end{Soutput}
\begin{Sinput}
> valcruz01 (2, 0.001)
\end{Sinput}
\begin{Soutput}
[1] 0.2629049
\end{Soutput}
\end{Schunk}
\end{frame}
\begin{frame} [fragile]
  \frametitle{Bibliography}
  \begin{itemize}
  \item \url{https://cran.r-project.org/view=MachineLearning}
  \item Ripley, B.; 1996; Pattern Recognition and Neural Networks; Cambridge University Press
  \item Venables, W., Ripley, B.; 2002; Modern Applied Statistics with S; Springer
  \end{itemize}
\end{frame}

\section{Appendix: Backpropagation}

\begin{frame}[plain]
  \title{Appendix: Backpropagation}\author{}\date{}
  \titlepage
\end{frame}

\begin{frame}[fragile]
  \frametitle{Simple perceptron: backpropagation}

  For a pattern \(p\):

  \[
  y_k^{(p)} =
  \phio\!\left(
  \sum_{i=0}^I w_{ik}\por x_i^{(p)}
  \right)
  \]

  Error (entropy, dichotomous response):
  \[
  E = \sum_p \sum_k
  \left[
  t_k^{(p)} \por \ln\frac{t_k^{(p)}}{y_k^{(p)}}
  + (1-t_k^{(p)}) \por
  \ln\frac{1-t_k^{(p)}}{1-y_k^{(p)}}
  \right]
  \]

  Goal: minimize \(E\) with respect to \(w_{ik}\).
\end{frame}

\begin{frame}[fragile]
  \frametitle{Simple perceptron: gradient}

  For a logistic activation, writing \(z_k = \sum_i w_{ik}\por x_i\):
  \[
  \phio'(z_k) = y_k(1-y_k)
  \]

  Derivative of the error:
  \[
  \frac{\partial E}{\partial w_{ik}}
  =
  \sum_p
  \left(y_k^{(p)} - t_k^{(p)}\right)
  \por x_i^{(p)}
  \]

  Update rule (gradient descent):
  \[
  w_{ik}
  \leftarrow
  w_{ik}
  - \eta
  \sum_p
  \left(y_k^{(p)} - t_k^{(p)}\right)
  \por x_i^{(p)}
  \]

  \(\eta>0\): learning rate (a sketch follows this frame).
\end{frame}
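
\begin{frame}[fragile]
  \frametitle{Simple perceptron: gradient}
  A minimal sketch of this update in R (batch version; the data,
  \texttt{eta} and the iteration count are illustrative):
\begin{Schunk}
\begin{Sinput}
> x <- cbind (1, scale (mtcars$mpg)) # x_0 = 1 carries the bias
> t <- mtcars$am                     # dichotomous target
> w <- c (0, 0); eta <- 0.1
> for (it in 1:100) {
+     y <- plogis (as.numeric (x %*% w))   # forward pass
+     w <- w - eta * colSums ((y - t) * x) # gradient step
+ }
\end{Sinput}
\end{Schunk}
\end{frame}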

\begin{frame}[fragile]
  \frametitle{Multilayer perceptron (1 hidden layer)}

  Hidden layer:
  \[
  h_j^{(p)} =
  \phi\!\left(
  \sum_{i=0}^I w_{ij}\por x_i^{(p)}
  \right)
  \]

  Output layer:
  \[
  y_k^{(p)} =
  \phio\!\left(
  \sum_{j=0}^J v_{jk}\por h_j^{(p)}
  \right)
  \]

  The error \(E\) is the same as before.
\end{frame}

\begin{frame}[fragile]
  \frametitle{Multilayer: backpropagation}

  Define the \emph{local error} at the output:

  \[
  \delta_k^{(p)} =
  y_k^{(p)} - t_k^{(p)}
  \]

  Gradient for the output layer:
  \[
  \frac{\partial E}{\partial v_{jk}}
  =
  \sum_p
  \delta_k^{(p)} \por h_j^{(p)}
  \]

  Update:
  \[
  v_{jk}
  \leftarrow
  v_{jk}
  - \eta
  \sum_p
  \delta_k^{(p)} \por h_j^{(p)}
  \]
\end{frame}

\begin{frame}[fragile]
  \frametitle{Multilayer: hidden-layer error}

  Error propagated back to the hidden neuron:

  \[
  \delta_j^{(p)}
  =
  \phi'\!\left(
  \sum_i w_{ij}\por x_i^{(p)}
  \right)
  \sum_k
  \delta_k^{(p)} \por v_{jk}
  \]

  Gradient for the input weights:
  \[
  \frac{\partial E}{\partial w_{ij}}
  =
  \sum_p
  \delta_j^{(p)} \por x_i^{(p)}
  \]

  Update:
  \[
  w_{ij}
  \leftarrow
  w_{ij}
  - \eta
  \sum_p
  \delta_j^{(p)} \por x_i^{(p)}
  \]
\end{frame}

\begin{frame}[fragile]
  \frametitle{Summary: the backpropagation algorithm}

  For each pattern \(p\):

  \begin{enumerate}
  \item Forward pass:
    compute \(h_j^{(p)}\), then \(y_k^{(p)}\).
  \item Compute the output errors:
    \(\delta_k^{(p)}\).
  \item Propagate the errors backwards:
    \(\delta_j^{(p)}\).
  \item Update the weights:
    \(v_{jk}\), then \(w_{ij}\).
  \end{enumerate}

  It is gradient descent applied via the chain rule;
  a compact sketch follows this frame.
\end{frame}
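
\begin{frame}[fragile]
  \frametitle{Summary: the backpropagation algorithm}
  A compact R sketch of the whole loop (batch version, one logistic
  output with \(\delta_k = y_k - t_k\); the data, \texttt{H},
  \texttt{eta} and the iteration count are illustrative, and
  \texttt{nnet} itself uses BFGS instead):
\begin{Schunk}
\begin{Sinput}
> X <- cbind (1, scale (mtcars$wt)) # bias + input
> t <- mtcars$am; H <- 2; eta <- 0.01
> set.seed (1)
> W <- matrix (runif (2*H, -.5, .5), 2, H) # input -> hidden
> V <- runif (H+1, -.5, .5)                # hidden -> output
> for (it in 1:1000) {
+     h <- cbind (1, plogis (X %*% W))     # forward pass
+     y <- plogis (as.numeric (h %*% V))
+     dk <- y - t                          # output error
+     dj <- h[,-1] * (1-h[,-1]) * outer (dk, V[-1]) # hidden error
+     V <- V - eta * colSums (dk * h)      # update output weights
+     W <- W - eta * crossprod (X, dj)     # update input weights
+ }
\end{Sinput}
\end{Schunk}
\end{frame}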

\begin{frame}[fragile]
  \frametitle{Gradient descent vs BFGS (what \texttt{nnet} uses)}

  Let \(E(\mathbf{w})\) be the error function and
  \(\nabla E(\mathbf{w})\) its gradient.

  \vspace{2mm}

  \textbf{1. Classical gradient descent}

  \[
  \mathbf{w}^{(t+1)}
  =
  \mathbf{w}^{(t)}
  -
  \eta
  \nabla E\!\left(\mathbf{w}^{(t)}\right)
  \]

  \begin{itemize}
  \item \(\eta>0\): explicit learning rate.
  \item Direction: the negative gradient.
  \item Linear convergence.
  \item Sensitive to the choice of \(\eta\).
  \end{itemize}

\end{frame}

\begin{frame}[fragile]
  \frametitle{Gradient descent vs BFGS (what \texttt{nnet} uses)}

  \vspace{3mm}

  \textbf{2. BFGS (quasi-Newton)}

  \[
  \mathbf{w}^{(t+1)}
  =
  \mathbf{w}^{(t)}
  -
  H_t^{-1}
  \nabla E\!\left(\mathbf{w}^{(t)}\right)
  \]

  where \(H_t^{-1}\) approximates the inverse Hessian.

  \begin{itemize}
  \item No explicit \(\eta\) parameter.
  \item Adaptive direction using curvature information.
  \item Internal line search for the step size.
  \item Superlinear convergence (under regularity conditions).
  \end{itemize}

\end{frame}
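
\begin{frame}[fragile]
  \frametitle{Gradient descent vs BFGS (what \texttt{nnet} uses)}
  A sketch of the contrast in R: \texttt{optim}'s BFGS minimizes a
  criterion with no learning rate to tune (here a deliberately
  simple least-squares \(E\); the result should match
  \texttt{coef(lm(mpg \textasciitilde{} wt, mtcars))}):
\begin{Schunk}
\begin{Sinput}
> E <- function (w) # SSE of mpg ~ wt
+     sum ((mtcars$mpg - w[1] - w[2] * mtcars$wt) ^ 2)
> optim (c (0, 0), E, method = "BFGS") $ par
\end{Sinput}
\end{Schunk}
\end{frame}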

\begin{frame}[fragile]
  \frametitle{What does \texttt{nnet} implement in R?}

  Package: \texttt{nnet}

  \vspace{2mm}

  \begin{itemize}
  \item Optimization via the BFGS algorithm.
  \item Directly minimizes the entropy (or the SSE).
  \item The step size is determined by a line search.
  \item There is no learning-rate parameter \(\eta\).
  \item The \texttt{decay} argument implements regularization:
  \[
  E_{\mathrm{reg}}
  =
  E + \lambda \sum w^2
  \]
  \end{itemize}

  \vspace{3mm}

  Therefore:

  \[
  \texttt{nnet}
  \quad \neq \quad
  \text{gradient descent with a fixed rate}
  \]

  It is a deterministic quasi-Newton method,
  well suited to small and medium-sized networks.
\end{frame}

\begin{frame}[fragile]
  \frametitle{Line search: the Armijo condition}

  Let \(E(\mathbf{w})\) be differentiable and
  \(\mathbf{d}_t\) a descent direction:

  \[
  \nabla E(\mathbf{w}^{(t)})^{\!\top}\mathbf{d}_t < 0
  \]

  The update is:

  \[
  \mathbf{w}^{(t+1)}
  =
  \mathbf{w}^{(t)}
  +
  \alpha_t \mathbf{d}_t
  \]

  \vspace{2mm}

  \textbf{Armijo condition (sufficient decrease)}

  Given \(c \in (0,1)\), find \(\alpha_t>0\) such that:

  \[
  E(\mathbf{w}^{(t)} + \alpha_t \mathbf{d}_t)
  \le
  E(\mathbf{w}^{(t)})
  +
  c \alpha_t
  \nabla E(\mathbf{w}^{(t)})^{\!\top}\mathbf{d}_t
  \]

  \vspace{2mm}

  Interpretation:
  the actual decrease must be proportional
  to the linear decrease predicted by the gradient.
\end{frame}

\begin{frame}[fragile]
  \frametitle{Line search: \emph{backtracking}}

  Typical procedure:

  \begin{enumerate}
  \item Set \(\alpha = \alpha_0\) (e.g.\ 1).
  \item While the Armijo condition fails:
    \[
    \alpha \leftarrow \rho \alpha,
    \qquad \rho \in (0,1)
    \]
  \item Take \(\alpha_t = \alpha\).
  \end{enumerate}

  \vspace{2mm}

  Properties:

  \begin{itemize}
  \item Guarantees a decrease.
  \item Avoids overly large steps.
  \item Compatible with both gradient descent and BFGS.
  \end{itemize}
\end{frame}
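
\begin{frame}[fragile]
  \frametitle{Line search: \emph{backtracking}}
  The procedure as a small R function (a sketch; argument names are
  illustrative: \texttt{w} current point, \texttt{d} descent
  direction, \texttt{g} gradient of \texttt{E} at \texttt{w}):
\begin{Schunk}
\begin{Sinput}
> backtrack <- function (E, w, d, g, a = 1, rho = .5, c1 = 1e-4)
+ {
+     ## shrink a until the Armijo condition holds
+     while (E (w + a*d) > E (w) + c1 * a * sum (g*d))
+         a <- rho * a
+     a
+ }
> ## usage: w <- w + backtrack (E, w, d = -g, g = g) * d,
> ## with g the gradient of E at w (e.g. numDeriv::grad)
\end{Sinput}
\end{Schunk}
\end{frame}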
\end{document}

