Model Notes, Revision 1

2017/03/22

These notes cover the two modes of the model: Learning (training, with the inference network) and Reconstruct (generation, driven by the prior).

Learning:

$H^{word}_{enc,i,j} = \begin{cases} f^{GRU}_{\Theta}(x_{i,j}) & \quad i \in [1,n],\; j=1\\ f^{GRU}_{\Theta}(x_{i,j},H^{word}_{enc,i,j-1}) & \quad i \in [1,n],\; j \in [2,m] \end{cases}$
$H^{con}_{enc,i} = \begin{cases} f^{GRU}_{\Phi}(H^{word}_{enc,i,-1}) & \quad i=1\\ f^{GRU}_{\Phi}(H^{word}_{enc,i,-1},H^{con}_{enc,i-1}) & \quad i \in [2,n] \end{cases}$
$H^{enc2lat}_{i} = \begin{cases} \begin{cases} f^{rnn}_{\vec z_1}(\text{init}) & \quad i'=n \\ f^{rnn}_{\vec z_1}(H^{con}_{enc,i'+1},H^{con\,\prime}_{enc,i'+1}) & \quad i' \in [n-1,1] \end{cases}& \quad i'=i=1\\ f^{rnn}_{\vec z_i}(H^{con}_{enc,i},\vec z_{i-1}) \text{ or } \operatorname{concat}/\textstyle\sum(H^{con}_{enc,i},\vec z_{i-1}) & \quad i \in [2,n] \end{cases}$

$\vec z_i \sim \mathcal{N}(\,\cdot\,|f^{mlp}_{\mu}(H^{enc2lat}_{i}),f^{mlp}_{\sigma}(H^{enc2lat}_{i}))$
$H^{con}_{dec,i} =f_{lat2dec}(\vec z_i)$
$H^{word}_{dec,i,j} = \begin{cases} f^{GRU}_{\Omega}(H^{con}_{dec,i}) & \quad i \in [1,n],j =1\\ f^{GRU}_{\Omega}(H^{con}_{dec,i},\hat x_{i,j},H^{word}_{dec,i,j-1}) & \quad i \in [1,n],j \in [2,m]\end{cases}$
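
As a concreteness check, here is a minimal PyTorch sketch of the Learning pass above, with toy sizes; the module names (`word_enc`, `enc2lat`, ...) are invented, the backward and forward $f^{rnn}_{\vec z}$ share one cell for brevity, and $f^{mlp}_{\sigma}$ is taken to output a log-variance (all assumptions, not the actual implementation).

```python
# Minimal sketch of the Learning pass; names/sizes are placeholders.
import torch
import torch.nn as nn

E, H, Z, n, m = 16, 32, 8, 4, 5          # embed/hidden/latent dims, n sentences of m words

word_enc = nn.GRUCell(E, H)              # f^GRU_Theta
con_enc  = nn.GRUCell(H, H)              # f^GRU_Phi
enc2lat  = nn.GRUCell(H, Z)              # f^rnn_{z_i} (shared with the backward pass here)
f_mu, f_logvar = nn.Linear(Z, Z), nn.Linear(Z, Z)   # f^mlp_mu, f^mlp_sigma (log-variance)

x = torch.randn(n, m, E)                 # stand-in word embeddings x_{i,j}

# Word level: the last hidden state H^word_enc,i,-1 summarizes sentence i.
h_word = torch.zeros(n, H)
for j in range(m):
    h_word = word_enc(x[:, j], h_word)

# Context level: H^con_enc,i over sentences.
h_con, h = [], torch.zeros(1, H)
for i in range(n):
    h = con_enc(h_word[i:i + 1], h)
    h_con.append(h)

# Backward pass producing H^enc2lat_1 (the inner i' = n..1 case).
hb = torch.zeros(1, Z)                   # the 'init' state
for i in reversed(range(1, n)):
    hb = enc2lat(h_con[i], hb)
H_enc2lat, zs = [hb], []

# Forward latent chain: z_{i-1} feeds H^enc2lat_i for i >= 2.
for i in range(n):
    mu, logvar = f_mu(H_enc2lat[i]), f_logvar(H_enc2lat[i])
    z = mu + (0.5 * logvar).exp() * torch.randn_like(mu)   # reparameterized z_i
    zs.append(z)
    if i + 1 < n:
        H_enc2lat.append(enc2lat(h_con[i + 1], z))
```

The $\operatorname{concat}/\sum$ alternative would replace the `enc2lat` cell with a concatenation or sum of its two arguments; $f_{lat2dec}$ and the word decoder $f^{GRU}_{\Omega}$ then run exactly as in the last two equations.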

Reconstruct:

$H^{enc2lat}_{i} =f^{rnn}_{\vec z_i}(H^{con}_{dec,i-1},\vec z_{i-1}) \text{ or } \operatorname{concat}/\textstyle\sum(H^{con}_{dec,i-1},\vec z_{i-1}) \quad i \in[2,n]$
$\vec{\mu_1}_{empirical}=avg(\vec{\mu_1}_{training})$
$\vec{\sigma_1}_{empirical}=avg(\vec{\sigma_1}_{training})$
$\vec{z_i} \sim \begin{cases} \mathcal{N}(\vec{\mu_1}_{empirical},\vec{\sigma_1}_{empirical}) & \quad i=1\\ \mathcal{N}(f^{mlp}_{\mu}(H^{enc2lat}_{i}),f^{mlp}_{\sigma}(H^{enc2lat}_{i})) & \quad i \in[2,n]\end{cases}$
$H^{con}_{dec,i} =f_{lat2dec}(\vec z_i)$
$H^{word}_{dec,i,j} = \begin{cases} f^{GRU}_{\Omega}(H^{con}_{dec,i}) & \quad i \in [1,n],j =1\\ f^{GRU}_{\Omega}(H^{con}_{dec,i},\hat x_{i,j},H^{word}_{dec,i,j-1}) & \quad i \in [1,n],j \in [2,m]\end{cases}$
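
A matching sketch of the Reconstruct pass: $\vec z_1$ is drawn from the empirical prior, later $\vec z_i$ from the prior network chained on the decoder context state. Again the names, sizes, and placeholder statistics are assumptions for illustration.

```python
# Sketch of the Reconstruct pass; names/sizes are placeholders.
import torch
import torch.nn as nn

H, Z, n = 32, 8, 4
enc2lat = nn.GRUCell(H, Z)               # f^rnn_{z_i}, now fed H^con_dec,i-1
f_mu, f_logvar = nn.Linear(Z, Z), nn.Linear(Z, Z)
lat2dec = nn.Linear(Z, H)                # f_lat2dec

mu1_emp    = torch.zeros(1, Z)           # avg(mu_1) over training (placeholder values)
sigma1_emp = torch.ones(1, Z)            # avg(sigma_1) over training (placeholder values)

h_dec, z = [], None
for i in range(n):
    if i == 0:
        z = mu1_emp + sigma1_emp * torch.randn(1, Z)        # z_1 ~ N(mu_emp, sigma_emp)
    else:
        h = enc2lat(h_dec[-1], z)                           # H^enc2lat_i
        z = f_mu(h) + (0.5 * f_logvar(h)).exp() * torch.randn(1, Z)
    h_dec.append(lat2dec(z))             # H^con_dec,i seeds the word decoder f^GRU_Omega
```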

KL Objective (ELBO derivation)

$\begin{equation}\begin{split}\log p_{\theta}(x)&= \log \int_{z} p_{\theta}(x,z) \\ &= \log \int_{z} q_\phi (z|x) \frac{p_{\theta}(x,z)}{q_\phi(z|x)} \\ &\ge \int_{z} q_\phi(z|x) \log \frac{p_\theta(x,z)}{q_\phi(z|x)} \quad \text{(Jensen's inequality)} \\ &= \mathbb E_{z\sim q_\phi(z|x)} [\log p_\theta(x,z)-\log q_\phi(z|x)]\end{split}\end{equation}$

Writing $\log p(x,z)=\log p(x)+\log p(z|x)$:

$\begin{equation}\begin{split}&= \mathbb E_{z\sim q(z|x)} [\log p(x)+\log p(z|x)-\log q(z|x)] \\&= \log p(x)- \mathbb E_{z\sim q(z|x)} \Big[\log \frac{q(z|x)}{p(z|x)}\Big] \\&= \log p_\theta(x) - D_{KL}(q_\phi(z|x)||p_\theta(z|x))\end{split}\end{equation}$

($\log p(x)$ leaves the expectation because it does not depend on $z$.) Writing instead $\log p(x,z)=\log p(x|z)+\log p(z)$:

$\begin{equation}\begin{split}&= \mathbb E_{z\sim q(z|x)} [\log p(x|z)+\log p(z)-\log q(z|x)] \\&= \mathbb E_{z\sim q(z|x)} [\log p(x|z)]- \mathbb E_{z\sim q(z|x)} \Big[\log \frac{q(z|x)}{p(z)}\Big] \\&= \mathbb E_{z\sim q_\phi(z|x)} [\log p_\theta(x|z)] - D_{KL}(q_\phi(z|x)||p_\theta(z)) \\&= {\cal L}(x,\theta,\phi)\end{split}\end{equation}$

(Here $\log p(x|z)$ depends on $z$ and must stay inside the expectation.)
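
The identity $\log p(x) = {\cal L}(x) + D_{KL}(q(z|x)||p(z|x))$ can be checked numerically on a conjugate Gaussian toy model ($z \sim \mathcal N(0,1)$, $x|z \sim \mathcal N(z,1)$), where $p(z|x)=\mathcal N(x/2, 1/2)$ and $p(x)=\mathcal N(0,2)$ are known in closed form. The sketch below only illustrates the identity; nothing here is part of the model.

```python
# Monte Carlo check: ELBO + KL(q || posterior) should equal log p(x).
import numpy as np
rng = np.random.default_rng(0)

def log_normal(v, mean, var):
    return -0.5 * (np.log(2 * np.pi * var) + (v - mean) ** 2 / var)

x = 1.3
m, s2 = 0.4, 0.6                          # an arbitrary q(z|x) = N(m, s2)
z = m + np.sqrt(s2) * rng.standard_normal(1_000_000)

# ELBO = E_q[log p(x|z) + log p(z) - log q(z|x)], estimated by Monte Carlo
elbo = np.mean(log_normal(x, z, 1.0) + log_normal(z, 0.0, 1.0)
               - log_normal(z, m, s2))

# Exact KL(q(z|x) || p(z|x)), with p(z|x) = N(x/2, 1/2)
mp, vp = x / 2, 0.5
kl_post = 0.5 * (np.log(vp / s2) - 1 + s2 / vp + (mp - m) ** 2 / vp)

print(elbo + kl_post)                     # ~= log p(x)
print(log_normal(x, 0.0, 2.0))            # exact log p(x) = log N(x; 0, 2)
```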

KL Objective (factorized form)

$$\begin{split}D_{KL}(q||p) &= D_{KL}\big(q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n)\,\big|\big|\,p(z_1) \cdots p(z_n|z_{n-1},x_{n-1})\big) \\&=\idotsint_{z_1,\dots,z_n} q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n) \log \frac{q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n)}{p(z_1) \cdots p(z_n|z_{n-1},x_{n-1})} \\&=\mathbb E_{z_1,\dots,z_n \sim q}\log \frac{q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n)}{p(z_1) \cdots p(z_n|z_{n-1},x_{n-1})}\end{split}$$

If estimating by sampling, draw $z^l$ ancestrally for $l=1,2,\dots,L$: first $z_1^l \sim q(z_1|x_1)$, then $z_2^l \sim q(z_2|z_{1}^l,x_2)$, and eventually $z_n^l \sim q(z_n|z_{n-1}^l,x_n)$. Otherwise, in closed form:

$$D_{KL}(q||p)=D_{KL}(q(z_1|x_1)||p(z_1))+\sum_{i=2}^{n} \mathbb E_{z_{i-1} \sim q(z_{i-1})}\big[D_{KL}(q(z_i|z_{i-1},x_i)||p(z_i|z_{i-1},x_{i-1}))\big]$$
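
A quick sanity check of this factorization on a two-step Gaussian chain, with all distributions invented for the check: the ancestral-sampling Monte Carlo estimate and the closed-form decomposition should agree.

```python
# MC estimate of D_KL(q||p) via ancestral sampling vs the closed form.
import numpy as np
rng = np.random.default_rng(0)

def log_normal(v, mean, var):
    return -0.5 * (np.log(2 * np.pi * var) + (v - mean) ** 2 / var)

def gauss_kl(mq, vq, mp, vp):             # KL(N(mq,vq) || N(mp,vp)), per sample
    return 0.5 * (np.log(vp / vq) - 1 + vq / vp + (mp - mq) ** 2 / vp)

# q(z1)=N(0.5,1), q(z2|z1)=N(0.8 z1 + 0.1, 0.5); p(z1)=N(0,1), p(z2|z1)=N(0.3 z1, 1)
L = 1_000_000
z1 = 0.5 + rng.standard_normal(L)                              # z1^l ~ q(z1|x1)
z2 = 0.8 * z1 + 0.1 + np.sqrt(0.5) * rng.standard_normal(L)    # z2^l ~ q(z2|z1^l,x2)

mc = np.mean(log_normal(z1, 0.5, 1) + log_normal(z2, 0.8 * z1 + 0.1, 0.5)
             - log_normal(z1, 0.0, 1) - log_normal(z2, 0.3 * z1, 1))

closed = gauss_kl(0.5, 1, 0.0, 1) + np.mean(gauss_kl(0.8 * z1 + 0.1, 0.5, 0.3 * z1, 1))
print(mc, closed)                         # the two agree up to MC noise
```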

  • $q(z_1|x_1),\dots,q(z_n|z_{n-1},x_n)$ and $p(z_1),\dots,p(z_n|z_{n-1},x_{n-1})$ are two sets of learned distributions (a sketch of the two parameter sets follows this list); both take the form:
    $\begin{cases}\mathcal{N}(f^{mlp}_{\mu}(H^{enc2lat,\phi}_{i}),f^{mlp}_{\sigma}(H^{enc2lat,\phi}_{i})) & \quad \text{when Learning: } q(z_1|x_1),\dots,q(z_n|z_{n-1},x_n)\\\mathcal{N}(f^{mlp}_{\mu}(H^{enc2lat,\theta}_{i}),f^{mlp}_{\sigma}(H^{enc2lat,\theta}_{i})) & \quad \text{when Reconstructing: } p(z_1),\dots,p(z_n|z_{n-1},x_{n-1})\end{cases}$
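
In code, "two sets of learning parameters" just means two pairs of $\mu/\sigma$ heads with the same shape; a minimal torch sketch, with the latent size an assumption:

```python
# phi heads parameterize q (Learning); theta heads parameterize p (Reconstruct).
import torch.nn as nn

Z = 8
q_mu, q_sigma = nn.Linear(Z, Z), nn.Linear(Z, Z)   # phi: applied to H^enc2lat,phi_i
p_mu, p_sigma = nn.Linear(Z, Z), nn.Linear(Z, Z)   # theta: applied to H^enc2lat,theta_i
```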

Extra

$$p(x_1,x_2,z_1,z_2)=p(x_2|z_2)\,p(z_2|z_1,x_1)\,p(x_1|z_1)\,p(z_1)\\q(x_1,x_2,z_1,z_2)=p(x_2|z_2)\,q(z_2|z_1,x_2)\,p(x_1|z_1)\,q(z_1|x_1)\\p(z_2)=\frac{p(x_1,x_2,z_1,z_2)}{p(x_1,x_2,z_1|z_2)}\\p(z_2)=\sum_{x_2}\sum_{x_1}\int_{z_1} p(x_1,x_2,z_1,z_2)\,dz_1\\p(z_2)=\sum_{x_2} p(x_2|z_2)\sum_{x_1}\int_{z_1} p(z_2|z_1,x_1)\,p(x_1|z_1)\,p(z_1)\,dz_1\\q(z_2)=\sum_{x_2} \int_{z_1}p(x_2|z_2)\,q(z_2|z_1,x_2)\sum_{x_1}p(x_1|z_1)\,q(z_1|x_1)\,dz_1$$
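
These marginalization identities can be verified mechanically in a fully discrete toy version of the model, where every integral becomes a sum; the probability tables below are random and exist only for the check.

```python
# Discrete check: p(z2) via the factorized formula equals the joint marginal.
import numpy as np
rng = np.random.default_rng(0)

def norm(a, axis=0):                      # normalize into a conditional table
    return a / a.sum(axis=axis, keepdims=True)

K = 3                                     # every variable takes K values
p_z1      = norm(rng.random(K))                     # p(z1)
p_x1_z1   = norm(rng.random((K, K)))                # p(x1|z1), indexed [x1, z1]
p_z2_z1x1 = norm(rng.random((K, K, K)))             # p(z2|z1,x1), indexed [z2, z1, x1]
p_x2_z2   = norm(rng.random((K, K)))                # p(x2|z2), indexed [x2, z2]

# joint p(x1,x2,z1,z2) = p(x2|z2) p(z2|z1,x1) p(x1|z1) p(z1)
joint = np.einsum('jl,lki,ik,k->ijkl', p_x2_z2, p_z2_z1x1, p_x1_z1, p_z1)

pz2_marginal = joint.sum(axis=(0, 1, 2))            # sum_{x1,x2,z1} p(x1,x2,z1,z2)
pz2_formula  = p_x2_z2.sum(0) * np.einsum('lki,ik,k->l', p_z2_z1x1, p_x1_z1, p_z1)
print(np.allclose(pz2_marginal, pz2_formula))       # True
```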

Reconstruct Objective

  • Average per-word perplexity: $e^{-\frac{1}{N}\sum_{i=1}^{N} \ln y^{pred}_i}$, where $y^{pred}_i$ is the probability the model assigns to the $i$-th target word (see the sketch after this list)

  • Average per-word/char log perplexity over the batch, with $y$ the one-hot target (so only the label index survives):
    $$\begin{equation}\begin{split}D_{Reconstruct}(X,\hat X) &\\&=\frac {\displaystyle\sum_{i=1}^{N} \sum_{j=1}^{M} (\sum_{k=1}^{K}-\log [f_{softmax}(logits_{i,j})]_k \times y_k) \times \text{mask}_{i,j}}{N \times M'} \\&=\frac {\displaystyle\sum_{i=1}^{N} \sum_{j=1}^{M} (-\log [f_{softmax}^{label}(logits_{i,j})] \times y_{label}) \times \text{mask}_{i,j}}{N \times M'}\\&=\frac {\displaystyle\sum_{i=1}^{N} \sum_{j=1}^{M} (-\log [f_{softmax}^{label}(logits_{i,j})] \times 1) \times \text{mask}_{i,j}}{N \times M'}\\&=\frac {\displaystyle\sum_{i=1}^{N} \sum_{j=1}^{M} -\log [f_{softmax}^{label}(logits_{i,j})] \times \text{mask}_{i,j}}{N \times M'}\end{split}\end{equation}$$
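
A hedged torch sketch of $D_{Reconstruct}$ and the matching perplexity, reading $N \times M'$ as the count of unmasked tokens (one plausible interpretation); `logits`, `targets`, and `mask` are random stand-ins.

```python
# Masked label cross-entropy plus the matching per-word perplexity.
import torch
import torch.nn.functional as F

N, M, K = 2, 5, 11                        # batch, max length, vocab size
logits  = torch.randn(N, M, K)
targets = torch.randint(K, (N, M))        # y_label
mask    = (torch.rand(N, M) > 0.2).float()

# -log f_softmax^label(logits_{i,j}): the NLL of the target word at (i, j)
nll = F.cross_entropy(logits.view(-1, K), targets.view(-1),
                      reduction='none').view(N, M)
d_reconstruct = (nll * mask).sum() / mask.sum()
perplexity = d_reconstruct.exp()          # e^{-(1/N') sum ln y_pred}
print(d_reconstruct.item(), perplexity.item())
```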

KL Objective (closed form)

$$\begin{equation}\begin{split}D_{KL}\big(q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n)\,||\,p(z_1) \cdots p(z_n|z_{n-1})\big) &\\&=\idotsint_{z_1,\dots,z_n} q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n) \log \frac{q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n)}{p(z_1) \cdots p(z_n|z_{n-1})} \\&=\idotsint q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n) \log \frac{q(z_1|x_1)}{p(z_1)}+\sum_{i=2}^{n}\idotsint q(z_1|x_1) \cdots q(z_n|z_{n-1},x_n) \log \frac{q(z_i|z_{i-1},x_i)}{p(z_i|z_{i-1})} \\&=\int_{z_1}q(z_1|x_1) \log \frac{q(z_1|x_1)}{p(z_1)}+\sum_{i=2}^{n}\int_{z_{i-1}}q(z_{i-1})\int_{z_i}q(z_i|z_{i-1},x_i) \log \frac{q(z_i|z_{i-1},x_i)}{p(z_i|z_{i-1})} \\&=D_{KL}(q(z_1|x_1)||p(z_1))+\sum_{i=2}^{n} \mathbb E_{z_{i-1}\sim q}\big[D_{KL}(q(z_i|z_{i-1},x_i)||p(z_i|z_{i-1}))\big]\\&\qquad\text{If } p(z_1)=\mathcal{N}(0,I) \text{, then}\\&=D_{KL}(q(z_1|x_1)||\mathcal{N}(0,I))+\sum_{i=2}^{n} \mathbb E_{z_{i-1}\sim q}\big[D_{KL}(q(z_i|z_{i-1},x_i)||p(z_i|z_{i-1}))\big]\end{split}\end{equation}$$

  • KL divergence between two multivariate Gaussians : $D_{KL}(q||p)=\frac{1}{2} (\log \frac{|\Sigma_p|}{|\Sigma_q|} -d+tr(\Sigma^{-1}_p\Sigma_q)+(\mu_p-\mu_q)^{T}\Sigma^{-1}_p(\mu_p-\mu_q))$

  • If every dimension of $z_i$ is independent ($\perp$) and $\Sigma$ is diagonal with variance vector $\vec \sigma$ on its diagonal, this reduces elementwise (see the sketch after this list) to:
    $D_{KL}(q_i||p_i)=\frac{1}{2} \Big(\sum\log \frac{\sigma_{p_i}}{\sigma_{q_i}} -d+\sum \frac{\sigma_{q_i}}{\sigma_{p_i}}+\sum \frac{(\mu_{p_i}-\mu_{q_i})^{2}}{\sigma_{p_i}}\Big)$

  • Masking: $D^{masked}_{KL}=D_{KL}(q||p) \otimes M$, where $M$ is the cost-mask matrix (zero at padded positions)

  • We want: $p(z_1)\approx\begin{cases}\mathcal{N}(\vec{\mu_1}_{empirical},\vec{\sigma_1}_{empirical}) \text{ or } \mathcal{N}(0,I)& \quad \text{when Reconstruct}\\\mathcal{N}(f^{mlp}_{\mu}(H^{enc2lat}_{1}),f^{mlp}_{\sigma}(H^{enc2lat}_{1})) & \quad \text{when Learning}\end{cases} \\ p(z_i|z_{i-1}) \approx q(z_i|z_{i-1},x_i)=\mathcal{N}(f^{mlp}_{\mu}(H^{enc2lat}_{i}),f^{mlp}_{\sigma}(H^{enc2lat}_{i}))$
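
The bullets above in code: elementwise KL between diagonal Gaussians (with $\vec \sigma$ read as the variance vector, matching the formula), the $\mathcal{N}(0,I)$ case for $z_1$, and the cost mask $M$ applied per sentence position. Shapes are invented for illustration.

```python
# Diagonal-Gaussian KL with masking; sigma arguments are variances.
import torch

def diag_gauss_kl(mu_q, var_q, mu_p, var_p):
    # 1/2 * sum_k [ log(var_p/var_q) - 1 + var_q/var_p + (mu_p - mu_q)^2 / var_p ]
    return 0.5 * (torch.log(var_p / var_q) - 1 + var_q / var_p
                  + (mu_p - mu_q) ** 2 / var_p).sum(-1)

N, n, Z = 2, 4, 8                         # batch, sentences per doc, latent dim
mu_q, var_q = torch.randn(N, n, Z), torch.rand(N, n, Z) + 0.1
mu_p, var_p = torch.randn(N, n, Z), torch.rand(N, n, Z) + 0.1
mask = (torch.rand(N, n) > 0.2).float()   # cost mask M over sentence positions

kl = diag_gauss_kl(mu_q, var_q, mu_p, var_p) * mask       # D_KL (x) M

# i = 1 against a standard-normal prior: D_KL(q(z_1|x_1) || N(0, I))
kl_z1 = diag_gauss_kl(mu_q[:, 0], var_q[:, 0],
                      torch.zeros(N, Z), torch.ones(N, Z))
```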