\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{graphicx}
\usepackage[right=0.8in, top=1in, bottom=1.2in, left=0.8in]{geometry}
\usepackage{setspace}
\usepackage{listings}
\setstretch{1.06}
\lstset{
basicstyle=\ttfamily,
mathescape
}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{\vspace{0.25cm}
\hbox to 5.78in { {COMS E6998-9:\hspace{0.12cm}Algorithmic
Techniques for Massive Data} \hfill #2 }
\vspace{0.48cm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{0.42cm}
\hbox to 5.78in { {#3 \hfill #4} }\vspace{0.25cm}
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes:\hspace{0.08cm}#4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Var}{\operatorname{Var}}
\begin{document}
\lecture{3 -- Frequency Moments, Heavy Hitters }{Sep 15, 2015}{Instructor:\hspace{0.08cm}\emph{Alex Andoni}}{\emph{Daniel Alabi, Wangda Zhang}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The following is scribed by Daniel %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
This lecture is about the second frequency moment and heavy
hitters. First, we present the Tug-of-War algorithm
by Alon-Matias-Szegedy to obtain a
$1 + \epsilon$ approximation using $O(\frac{1}{\epsilon^2}\log n)$
space. Second, we define heavy hitters and present the CountMin
algorithm which can be used to obtain heavy hitters.
\section{Second Frequency Moment}
Assume -- as in previous lectures -- that we are given a stream of
length $m$ from which we want to obtain a
frequency vector
$f = (f_1, f_2, \ldots, f_n)$
($n$ distinct elements)
where $f_i$ is the frequency of the $i$th distinct element in
the stream, then the $k$th frequency moment of the stream is
\begin{equation}
F_k = \sum_{i=1}^{n}f_i^k
\end{equation}
In Lectures 1 and 2, we estimated $F_0$ and $F_1$ corresponding
to the number of distinct elements and the length of the
stream respectively. Now we wish to obtain the second moment:
\begin{equation}
F_2 = \sum_{i=1}^{n}f_i^2
\end{equation}
The idea is to use i.i.d.\ random variables
$r_i (= r(i))$
where $P(r_i = 1) = P(r_i = -1) = \frac{1}{2}$
(Rademacher random variables) to get an estimator.
Thus,
$\E[r_i] = 0$ and $\Var[r_i] = 1$
(since $r_i^2 = 1$ and $\Var[r_i] = \E[r_i^2] - (\E[r_i])^2$).
We can think of $r_i$ as a random variable defined for
every distinct element so that
\begin{equation}
r: [n] \rightarrow \{-1, +1\}
\end{equation}
As an example, let's consider the case when the entries of the
frequency vector are all 1s ($\Rightarrow m = n$). Let
\begin{equation}
z = \sum r_i
\end{equation}
Then
$\E[z] = \sum\E[r_i] = 0$
by linearity of expectation and
using the distribution of $r_i$. Similarly,
$\Var[z] = \Var[\sum r_i] = m$. We can apply Chebyshev to obtain that
$|z-0| = |z| \leq O(\sqrt{m})$ with constant probability.
It turns out that this bound is tight.
Now, let's consider the more general case and define
\begin{equation}
z = \sum_i r_i \cdot f_i
\end{equation}
\begin{claim}
$\E[z^2] = \sum_{i=1}^{n} f_i^2 = F_2$
\end{claim}
\begin{proof}
\begin{align}
\E[z^2] &= \E[(\sum_{i=1}^n r_i f_i)^2] \\
&= \sum_i\E[r_i^2]f_i^2 + \sum_{i \neq j}\E[r_i]\E[r_j]f_i f_j \label{eq:z2-cross} \\
&= \sum_i f_i^2\E[r_i^2] \label{eq:z2-drop} \\
&= \sum_i f_i^2 = F_2 \label{eq:z2-final}
\end{align}
\eqref{eq:z2-cross} holds because for $i \neq j$, $r_i$ and $r_j$ are
independent, while \eqref{eq:z2-drop} holds because $\E[r_i] = 0$ for all $i$.
Finally, \eqref{eq:z2-final} holds because $r_i^2 = 1$ ($\Rightarrow \E[r_i^2] = 1$).
\end{proof}
\begin{claim}
$\Var[z^2] \leq O(1) \cdot F_2^2$
\end{claim}
\begin{proof}
First, let's compute $\E[z^4]$:
\begin{align}
\E[z^4] &= \E[(\sum_i r_i f_i)^4] \\
&= \E[\sum_{i, j, k, l} r_i f_i r_j f_j r_k f_k r_l f_l] \\
&= \sum_{i, j, k, l} f_i f_j f_k f_l\E[r_i r_j r_k r_l] \\
&= \sum_i f_i^4 + 6\sum_{i < j} f_i^2 f_j^2 \label{eq:z4-diag} \\
&\leq O(1) \cdot (\sum_i f_i^2)^2
\end{align}
\eqref{eq:z4-diag} holds because:
\begin{itemize}
\item of the independence of the random variables;
\item the terms containing an odd power of some $r_i$ have expectation zero;
\item there are $\binom{4}{2} = 6$ terms of the form $r_i^2r_j^2$ for each pair $i < j$.
\end{itemize}
Finally, since $\Var[z^2] = \E[z^4] - (\E[z^2])^2$, we obtain that
$\Var[z^2] \leq \E[z^4] \leq O(1) \cdot F_2^2$.
\end{proof}
Having bounded $\E[z^2]$ and $\Var[z^2]$, we present the algorithm
below
\subsection{Tug-of-War (Alon-Matias-Szegedy 1996)}
We maintain a counter $z$.
\begin{enumerate}
\item Initialize $z = 0$
\item When we see element $i$: $z = z + r(i)$
\item Return the estimator $z^2$
\end{enumerate}
Earlier, we defined $r_i$ in the form
$r: [n] \rightarrow \{-1, +1\}$ where $r_i$ acts like a hash
function. The hash function for the $r_i$s should be 4-wise
independent.
Next, we apply the ``average trick'' using
$k = O(\frac{1}{\epsilon^2})$
parallel runs of the algorithm to obtain a
$1 + \epsilon$ approximation in $O(\frac{1}{\epsilon^2}\log n)$
space.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The following is scribed by Wangda %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Linearity}
So far we have only considered simple estimators, and we next consider something more complex. Suppose we have two parts of a stream seen by two different estimators, and we want to estimate the union of these two parts (i.e., how we can combine them).
\begin{claim}
Linearity: given estimates $z'$ for $f'$ and $z''$ for $f''$, we can combine them as $z=z'+z''$ for $f=f'+f''$.
\end{claim}
\begin{proof}
Since $z'=\sum r_if'_i$ and $z''=\sum r_if''_i$, we have $z'+z'' = \sum r_i(f'_i+f''_i)$. Note that we need to use the same randomness for two estimates.
\end{proof}
Similarly, we can use $(z'-z'')^2$ to estimate $\sum(f'_i - f''_i)^2$. However, we cannot use linearity in a similar way for $\sum|f'_i - f''_i|$, and will discuss this point later in class.
\subsection{General Streaming Model}
We now consider a more generalized model: at each moment, we have an update $(i, \delta_i)$ to increase the $i$-th entry by $\delta_i$. ($\delta_i$ may be negative)
A linear algorithm $S$ handles this easily: $S(f+e_i\delta_i) = S(f) + S(e_i\delta_i)$. We call $S$ a sketch. According to [Li-Nguyen-Woodruff'14], any algorithm for general streaming might as well be linear.
\section{Heavy Hitters}
Now that we are able to compute many types of frequency counts, we wonder if we can also compute the max frequency in a stream. It turns out that we cannot: it is impossible to approximate the max frequency in sublinear space. Therefore, we will solve a more modest problem where we want to detect the max-frequency element if it is very heavy. We will show that we can find these heavy hitters in space $O(1/\phi)$.
\begin{definition}
$i$ is $\phi$-heavy if $f_i \ge \phi\sum_jf_j$.
\end{definition}
The basic idea is still to use hash functions. A first-attempt method uses a single hash function $h$ mapping from $[n]$ to $[w]$ randomly, where $w = O(1/\phi)$. Then each element $i$ goes to bucket $h(i)$, and we sum up the frequencies in each of the $w$ buckets; denote by $S(b)$ the sum of frequencies in bucket $b$. So the estimator for $f_i$ is $\hat{f_i} = S(h(i))$.
For example, consider a stream of 2, 5, 7, 5, 5. If the hash function $h_1$ works as $h_1(2)=2, h_1(5)=1, h_1(7)=2$, then we will obtain the following estimates: $\hat{f_2}=2, \hat{f_5}=3, \hat{f_7}=2$. However, for an element that never appears in the stream, e.g. 11, this method also estimates its frequency as $\hat{f_{11}}=2$, assuming $h_1(11)=2$.
\begin{claim}
$S(h(i))$ is a biased estimator.
\end{claim}
\begin{proof}
Analyzing this estimator, we have $\hat{f_i} = S(h(i)) = f_i + \sum_{j \neq i:\, h(j)=h(i)}f_j$. Let $C = \sum_{j \neq i:\, h(j)=h(i)}f_j$. Thus,
\begin{equation*}
\E[C] = \sum_{j \neq i}\Pr[h(j)=h(i)]\cdot f_j = \sum_{j\neq i}\frac{f_j}{w} \neq 0. \qedhere
\end{equation*}
\end{proof}
However, it is easy to see that $\E[C] \le \frac{\sum_j f_j}{w}$. So the bias is at most $\sum_j f_j/w$, which is small for $f_i \gg \sum_j f_j/w$. By Markov's inequality, we have $C \le \frac{10\sum_j f_j}{w}$ with at least 90\% probability, i.e.
\[
\Pr\left[\hat{f_i} - f_i < O\left(\frac{\sum_j f_j}{w}\right)\right] \ge 0.9.
\]
For $w=O\left(\frac{1}{\epsilon\phi}\right)$ and $f_i \ge \phi\sum_j f_j$, we have $C \le \epsilon f_i$. That is, $\hat{f_i}$ is a $(1+\epsilon)$ approximation (with 90\% probability).
Still, there are two issues with this estimator: (1) only constant probability; (2) overestimate for many indices (10\%). Fundamentally, there is a conflict between avoiding many collisions and reducing space used by the hash table.
\subsection{CountMin}
This motivates us to use the ``median trick''. We can use $L=O(\log n)$ hash tables with hash functions $h_j$. The CountMin algorithm works as follows:
\begin{lstlisting}
Initialize(w, L):
array S[L][w]
L hash functions $h_1, \dots, h_L$, into $\{1, \dots, w\}$
Process(int i):
for (j = 0; j < L; ++j)
S[j][$h_j(i)$] += 1
Estimator:
foreach i in PossibleIP:
$\hat{f_i}$ = $\mathrm{median}_j$(S[j][$h_j(i)$])
\end{lstlisting}
\begin{claim}
The median is a $\pm\epsilon\phi\sum_j f_j$ estimator with $(1-1/n^2)$ probability.
\end{claim}
\begin{proof}
For an index $i$, each row (out of the $L$ rows) gives $\hat{f_i} = f_i \pm \epsilon\phi\sum_j f_j$ with 90\% probability. By the Median Trick (see Lecture 2), the median gives an estimator with additive error $\pm\epsilon\phi\sum_j f_j$ with $(1-1/n^2)$ probability.
\end{proof}
Alternatively, we can take the min, instead of median, since all counts are overestimated.
\subsection{Output Heavy Hitters}
We can now identify the heavy hitters by iterating over all $i$'s and outputting those with $\frac{\hat{f_i}}{\sum_j f_j} \ge \phi$. In particular, for true frequencies $f_i$s,
\begin{itemize}
\item if $\frac{f_i}{\sum_j f_j} \le \phi(1-\epsilon)$, then $i$ is not in output;
\item if $\frac{f_i}{\sum_j f_j} \ge \phi(1+\epsilon)$, then $i$ is reported as a heavy hitter;
\item if $\phi(1-\epsilon) \le \frac{f_i}{\sum_j f_j} \le \phi(1+\epsilon)$, then $i$ may or may not be reported as a heavy hitter.
\end{itemize}
If we really care about those elements in between, then we could take more space to further narrow the gap.
The space used is $O\left(\frac{\log^2 n}{\epsilon\phi}\right)$ bits. Since we iterate over all $i$'s, the time complexity is $\Omega(n)$.
We can improve this time complexity, at the cost of increasing space to $O\left(\frac{\log^3 n}{\epsilon\phi}\right)$ bits. The idea is to use dyadic intervals. For each level, we maintain its own sketch, and find the heavy hitters by following down the subtrees of heavy hitters at intermediate levels.
\end{document}