\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{epsfig}
\usepackage[right=0.8in, top=1in, bottom=1.2in, left=0.8in]{geometry}
\usepackage{setspace}
\spacing{1.06}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{mleftright, xspace,hyperref, todonotes}
% Probability
\newcommand{\proba}{\Pr}
\newcommand{\probaOf}[1]{\proba\!\left[\, #1\, \right]}
\newcommand{\probaCond}[2]{\proba\!\left[\, #1 \;\middle\vert\; #2\, \right]}
\newcommand{\probaDistrOf}[2]{\proba_{#1}\left[\, #2\, \right]}
% Support of a distribution/function
\newcommand{\supp}[1]{\operatorname{supp}\!\left(#1\right)}
% Expectation & variance
\newcommand{\expect}[1]{\mathbb{E}\!\left[#1\right]}
\newcommand{\expectCond}[2]{\mathbb{E}\!\left[\, #1 \;\middle\vert\; #2\, \right]}
\newcommand{\shortexpect}{\mathbb{E}}
\newcommand{\var}{\operatorname{Var}}
% Distributions
\newcommand{\uniform}{\ensuremath{\mathcal{U}}}
\newcommand{\uniformOn}[1]{\ensuremath{\uniform\!\left( #1 \right) }}
\newcommand{\geom}[1]{\ensuremath{\operatorname{Geom}\!\left( #1 \right)}}
\newcommand{\bernoulli}[1]{\ensuremath{\operatorname{Bern}\!\left( #1 \right)}}
\newcommand{\bern}[2]{\ensuremath{\operatorname{Bern}^{#1}\!\left( #2 \right)}}
\newcommand{\binomial}[2]{\ensuremath{\operatorname{Bin}\!\left( #1, #2 \right)}}
\newcommand{\poisson}[1]{\ensuremath{\operatorname{Poisson}\!\left( #1 \right) }}
\newcommand{\gaussian}[2]{\ensuremath{ \mathcal{N}\!\left(#1,#2\right) }}
\newcommand{\gaussianpdf}[2]{\ensuremath{ g_{#1,#2}}}
\newcommand{\betadistr}[2]{\ensuremath{ \operatorname{Beta}\!\left( #1, #2 \right) }}
% Norms
\newcommand{\norm}[1]{\lVert#1{\rVert}}
\newcommand{\normone}[1]{{\norm{#1}}_1}
\newcommand{\normtwo}[1]{{\norm{#1}}_2}
\newcommand{\norminf}[1]{{\norm{#1}}_\infty}
\newcommand{\abs}[1]{\left\lvert #1 \right\rvert}
\newcommand{\dabs}[1]{\lvert #1 \rvert}
\newcommand{\vect}[1]{\mathbf{#1}} % shortcut
% Sets and indicators
\newcommand{\setOfSuchThat}[2]{ \left\{\; #1 \;\colon\; #2\; \right\} } % sets such as "{ elems | condition }"
\newcommand{\indicSet}[1]{\mathbf{1}_{#1}} % indicator function
\newcommand{\indic}[1]{\indicSet{\left\{#1\right\}}} % indicator function
\newcommand{\disjunion}{\amalg}%\coprod, \dotcup...
\newcommand{\eps}{\ensuremath{\varepsilon}\xspace}
\newcommand{\eqdef}{\stackrel{\rm def}{=}}
% Complexity
\newcommand{\littleO}[1]{{o\mleft( #1 \mright)}}
\newcommand{\bigO}[1]{{O\mleft( #1 \mright)}}
\newcommand{\bigTheta}[1]{{\Theta\mleft( #1 \mright)}}
\newcommand{\bigOmega}[1]{{\Omega\mleft( #1 \mright)}}
\newcommand{\tildeO}[1]{\tilde{O}\mleft( #1 \mright)}
\newcommand{\tildeTheta}[1]{\operatorname{\tilde{\Theta}}\mleft( #1 \mright)}
\newcommand{\tildeOmega}[1]{\operatorname{\tilde{\Omega}}\mleft( #1 \mright)}
\providecommand{\poly}{\operatorname*{poly}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{\vspace{0.25cm}
\hbox to 5.78in { {COMS E6998-9:\hspace{0.12cm}Algorithmic
Techniques for Massive Data} \hfill #2 }
\vspace{0.48cm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{0.42cm}
\hbox to 5.78in { {#3 \hfill #4} }\vspace{0.25cm}
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes:\hspace{0.08cm}#4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\begin{document}
\lecture{7 -- Dynamic sampling, Dimension Reduction}{Sep 29, 2015}{Instructor:\hspace{0.08cm}\emph{Alex Andoni}}{\emph{Cl\'ement Canonne}}
\section{Dynamic Sampling (and applications)}
Recall that in the previous lecture, we introduced the following subproblem as building block for an approach to solve the \emph{connectivity} problem on dynamic graph streams:
\subsection{Dynamic Sampling}
\begin{description}
\item[Stream:] general updates to a vector $x\in\{-1,0,+1\}^n$
\item[Goal:] output $X\in[n]$ with $\probaOf{X=i} = \frac{\abs{x_i}}{\sum_{j=1}^n \abs{x_j}}$.
\end{description}
Before describing and analyzing an algorithm achieving this goal, we describe (some) intuition behind it by considering the following two extreme cases, where we let $D\eqdef\setOfSuchThat{i\in [n]}{x_i \neq 0}$:
\begin{itemize}
\item If $\abs{D}=10$, then each $x_i\neq 0$ is a $\frac{1}{10}$-heavy hitter: we can use \textsc{CountSketch} to recover all of them using a total of $\bigO{\log n}$ space.\footnote{This actually leverages some extra guarantees of \textsc{CountSketch}, not explicitly stated in the previous lectures. Namely, defining for a vector $\vect{x}$ its \emph{tail} $\vect{x}^{\rm tail}$ as $\vect{x}$ restricted to the set of indices that do not belong to the top-$k$ coordinates (where in the previous lectures we had $k=1/\phi$), then for all $i\in[n]$ \textsc{CountSketch} returns an estimate of $\vect{x}_i$ accurate to within an additive $\pm\normtwo{\vect{x}^{\rm tail}}/k$.}
\item If $\abs{D} = 10\sqrt{n}$, we can \emph{downsample}. That is, we can first obtain a random subset $S\subseteq[n]$ by including independently each coordinate $i\in[n]$ with probability $1/\sqrt{n}$ (so that $\abs{S}\approx \sqrt{n}$), and then focus on the substream involving indices $i\in S$. (Note that this allows us to reduce to the previous case, since $\expect{\abs{D\cap S}} = 10$, and with high probability we should actually have $\abs{D\cap S}\approx 10$.)
\end{itemize}
The actual algorithm will in some sense \emph{interpolate} between these cases, by considering all possible ``orders of magnitude'' for $\abs{D}$ (and focusing on the one that works).
\subsubsection{Basic Sketch} We start by choosing a hash function $g\colon[n]\to\{0,\dots,n-1\}\equiv\{0,1\}^L$ (where $L\eqdef\log n$), and let $h\colon[n]\to[L]$ be the function defined by
\[
h(i) \eqdef \max\setOfSuchThat{k\in\{0,\dots,L\}}{ \exists x\in\{0,1\}^\ast,\ g(i) = x0^k }, \qquad i\in [n].
\]
(That is, $h(i)$ is the number of tail zeros in the binary expansion of $g(i)$: if $g(i) = 01000100$, then $h(i)=2$). This implies that, over the randomness of the hash function $g$, for any $i\in[n]$
\[
\probaOf{h(i) = j} = \begin{cases}
\frac{1}{2^{j+1}} & \text{ for } 0\leq j\leq L-1 \\
\frac{1}{2^{L}} & \text{ for } j=L
\end{cases}
\]
where the first expression comes from the fact that $h(i) = j$ iff the last $j$ bits are $0$ \emph{and} the bit just before is $1$ (which happens with probability $2^{-j}\cdot \frac{1}{2}$). In particular, $\sum_{j=0}^L \probaOf{h(i) = j} = 1$, as it should for a probability distribution.
The algorithm then partitions the stream into $L+1$ substreams $I_0,\dots, I_L$, where the substream $I_j$ only includes the indices $i\in[n]$ such that $h(i)=j$. The crucial observation here is that this implies that
\[
\expect{\abs{D\cap I_j}} = \frac{\abs{D}}{2^{j+1}}, \qquad 0\leq j\leq L-1
\]
i.e. ``stream $I_j$ corresponds to downsampling with probability $1/2^{j+1}$.''
\paragraph{Sketch.} For each $0\leq j\leq L$:
\begin{itemize}
\item Store $CS_j$: \textsc{CountSketch} on $I_j$, with parameter $\phi\eqdef \frac{1}{100}$;
\item Store $DC_j$: \textsc{DistinctCountSketch} on $I_j$, with parameter $\eps\eqdef \frac{1}{10}$ (for a $1.1$-approximation);
\end{itemize}
both with success probability $1-\frac{1}{n}$ (for a union bound over all streams and iterations) (which costs an extra $\bigO{\log n}$ factor in the space complexity).\footnote{Note that it will become apparent later in the analysis that $1-\frac{1}{10\log n}$ or so would be sufficient, if one wanted to do precise bookkeeping.} Note that for \textsc{DistinctCountSketch} we can use the \emph{linear} sketch \textsc{Tug-of-War} (which approximates the frequency moment $F_2$). Indeed, since each $x_i\in\{-1,0,1\}$, we get $\sum_{i}x_i^2 = \sum_{i} \indic{x_i\neq 0} = \abs{D}$.
\paragraph{Estimation.}
\begin{enumerate}
\item\label{step:1} Find a substream $I_j$ such that $DC_j\in[1,20]$: if none, output \textsf{fail}.
\item\label{step:2} Recover all $i\in I_j$ such that $x_i\neq 0$ (using $CS_j$).
\item\label{step:3} Output one of them uniformly at random.
\end{enumerate}
\paragraph{Analysis.} (We condition on all sketches $DC_j,CS_j$ computed in the sketching stage to meet their guarantee, which overall by a union bound over all $\bigO{L}$ sketches happens with probability $1-o(1)$.)
\begin{itemize}
\item First, if $0 < \abs{D} < 10$, then there exists some $j$ for which $DC_j\in[1,11]$; thus, the algorithm does not output \textsf{fail} in the first step and the rest goes through.
\item We can therefore assume $\abs{D} \geq 10$. Let $k\geq 0$ be such that $\abs{D}\in[10\cdot 2^k, 10\cdot 2^{k+1})$; then,
\begin{equation}
\shortexpect\abs{D\cap I_k} = \frac{\abs{D}}{2^{k+1}} \in [5,10)
\end{equation}
and furthermore, setting $p\eqdef\probaOf{ i\in I_k } = \frac{1}{2^{k+1}}$ for convenience, one can compute the variance as follows:
\begin{align*}
\var \abs{D\cap I_k} &= \var \sum_{i\in D} \indic{i\in I_k} \\
&= \sum_{i\in D} \var \indic{i\in I_k} \tag{(Pairwise) independence}\\
&= \sum_{i\in D} p(1-p) \tag{Variance of a Bernoulli}\\
&= \abs{D}p(1-p) \leq \frac{\abs{D}}{2^{k+1}} \\
&\leq 10. \tag{by definition of $k$}
\end{align*}
Applying Chebyshev's inequality, we obtain that
\begin{align*}
\probaOf{ \abs{D\cap I_k} \notin [1,20] } &\leq \probaOf{ \abs{ \abs{D\cap I_k} - \shortexpect\abs{D\cap I_k} } > 4 } \\
&\leq \frac{\var \abs{D\cap I_k}}{16} \\
&\leq \frac{10}{16} = 0.625.
\end{align*}
Thus, we have $DC_k\in[1,20]$ with probability at least $0.375$. Conditioning on this event, the algorithm does not output \textsf{fail} in Step~\ref{step:1}: let then $j$ be any index such that $DC_j\in[1,20]$ (there is at least one such index, namely $k$). This, along with the setting of the parameter $\phi$, guarantees that $CS_j$ will recover a heavy hitter: that is, $i\in D\cap I_j$.
\end{itemize}
Finally, by symmetry, we can see that as long as the algorithm reaches Step~\ref{step:3} and outputs $i\in D\cap I_j$ for \emph{some} $j$, then $i$ is uniformly distributed over $D$. \todo[size=\scriptsize]{Does this need elaborating?}
\begin{observation}
As the analysis only relied on independence for the computation of the variance (to apply Chebyshev's inequality), a pairwise independent (family of) hash function(s) is sufficient for $g$.
\end{observation}
\paragraph{Guarantees.} The algorithm we described and analyzed above, \textsc{DynSampleBasic}, only offers the following guarantees:
\begin{itemize}
\item it fails with probability at most $0.625$ (and we \emph{know} when it does, as it explicitly outputs \textsf{fail});
\item whenever it does not fail, it outputs $i$ uniformly distributed in $D$ (modulo a negligible probability that either one of the $CS_j$'s or $DC_j$'s does not succeed).
\end{itemize}
To reduce the failure probability, one can do the usual trick: that is, taking independent copies. Below is the overall algorithm, \textsc{DynSampleFull}:
\begin{itemize}
\item Run $\ell=\bigO{\log n}$ independent copies of \textsc{DynSampleBasic}: with probability at least $1-(0.625)^\ell > 1-\frac{1}{n}$, at least one of them will not output \textsf{fail}.
\item The space needed overall is $\bigO{\log^4 n}$ words, for $\ell=\bigO{\log n}$ independent copies with each $L=\bigO{\log n}$ different substreams, all involving $\bigO{\log^2 n}$ space (for $CS_j$,$DC_j$ called with error parameter $1/n$).\footnote{Note that as hinted before, this can be reduced to $\bigO{\log^3 n\cdot \log\log n}$ words, setting the error probability of each $CS_j$,$DC_j$ to be only $1/\log n$.}
\end{itemize}
\subsection{Back to Dynamic Graphs (and Connectivity)}
Recall that in this setting, we are given the edges of an $n$-node graph $G=(V,E)$ as a stream of insertions or deletions. In particular, we can consider the following encoding of the graph, as \emph{node-edge incidence} vectors: to each $v\in V=[n]$ corresponds a vector $\vect{x}_v\in\mathbb{R}^p$ (for $p\eqdef \binom{n}{2}$), where
\begin{itemize}
\item for $j > v$, $\vect{x}_v(v,j) = \indic{(v,j)\in E}$;
\item for $j < v$, $\vect{x}_v(v,j) = -\indic{(v,j)\in E}$.
\end{itemize}
In particular, non-zero coordinates of $\vect{x}_v$ correspond to edges incident to $v$ (and the sign of $\vect{x}_v(v,j)$ is an ``artificial orientation'' we imposed to it).
\paragraph{Idea.} We can use dynamic sampling to sample uniformly an edge incident to each $v$. Then, we use these edges to \emph{collapse} the graph: replacing two nodes $u,v$ connected by an edge we sampled by a ``meta-node,'' combining the incident edges to both $u$ and $v$.
Ideally, we would like to iterate until either we are left with a single meta-node (in which case the graph was connected) or strictly more than one (in which case the graph was not). The issue, however, lies in this iteration: namely, how to sample edges incident to these ``meta-nodes,'' while we only have (streaming) access to the edges of the actual graph $G$?
The answer lies in the following crucial observation, which also explains the particular type of $\pm1$ encoding that was chosen for the vectors $\vect{x}_v$:
\begin{claim}\label{claim:useful:property}
For a set $Q\subseteq V$, define the node-edge incidence vector of the ``meta-node'' $Q$ as $\vect{x}_Q\eqdef \sum_{v\in Q} \vect{x}_v\in\mathbb{R}^p$. Then, $\vect{x}_Q\in\{-1,0,1\}^p$, and moreover it has a non-zero component at coordinate $(i,j)$ if and only if edge $(i,j)$ exists and crosses from $Q$ to $V\setminus Q$. (That is, $(i,j)\in E$ and $\abs{\{i,j\}\cap Q}=1$.)
\end{claim}
\begin{proof}
If $(i,j)\notin E$, then $\vect{x}_Q(i,j)= \vect{x}_i(i,j) + \vect{x}_j(i,j) = 0+0=0$. Otherwise, assume $(i,j)\in E$: if $\abs{\{i,j\}\cap Q}=2$, then $\vect{x}_Q(i,j)= \vect{x}_i(i,j) + \vect{x}_j(i,j) = 1-1=0$. If $\abs{\{i,j\}\cap Q}=0$, then $\vect{x}_Q(i,j)=0$ immediately (no term in the sum with something in that coordinate). Only the case $\abs{\{i,j\}\cap Q}=1$ results in $\vect{x}_Q(i,j)=1$ or $\vect{x}_Q(i,j)=-1$, depending on which of $i,j$ belongs to $Q$.
\end{proof}
Another very useful property of these $\vect{x}_Q$: \emph{to compute them, we only need the sketches of the $\vect{x}_v$'s, since they are linear sketches.} Therefore, we can actually sample a random edge from $\vect{x}_Q$ (for any fixed $Q\subseteq V$), using only $\abs{V}\cdot\bigO{\poly\log \abs{V}} = \bigO{n\poly\log n}$ space!
\[
\textsc{DynSampleFull}\Big(\sum_{v\in Q} \vect{x}_v\Big) = \sum_{v\in Q} \textsc{DynSampleFull}(\vect{x}_v), \qquad \forall Q\subseteq V.
\]
An (almost correct) idea would therefore be to do the following:
\begin{enumerate}
\item Initiate a sketch (of \textsc{DynSampleFull}) for each of the $n$ vectors $\vect{x}_v$
\item Check connectivity with the following recursive way, outputting \textsf{no} whenever one of the steps outputs \textsf{fail}, and \textsf{yes} if we reach a graph with only one (meta)-node:
\begin{enumerate}
\item sample an edge for each of the meta-nodes $v$ of the current graph;
\item contract all sampled edges, obtaining a smaller graph with meta-nodes corresponding to connected components of the previous one;
\item recurse on the new graph.
\end{enumerate}
\end{enumerate}
Since at every recursive step, the number of meta-nodes is easily seen to be reduced by a factor at least $2$, only $\bigO{\log n}$ such steps are needed overall.
However, the above has a big flaw: namely, the iterations (calls to $\textsc{DynSampleFull}$ on the ``new meta-nodes'') are \emph{not} independent, compromising correctness\dots (Indeed, the guarantees of~\textsc{DynSampleFull} do not apply if the queries are made on \emph{adaptively} chosen combinations of previous queries: an adversary could basically learn enough about the sketches to eventually query some $\vect{x}_Q$ on which \textsc{DynSampleFull} is ensured to fail.)
A simple solution: we can use \emph{new} independent copies of \textsc{DynSampleFull} at every iteration... only costing an $\bigO{\log n}$ blowup in the space required (since this is the number of iterations).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Dimension Reduction}
Barely scratched... next lecture.
\end{document}