\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{graphicx}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage[right=0.8in, top=1in, bottom=1.2in, left=0.8in]{geometry}
\usepackage{setspace}
\spacing{1.06}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{\vspace{0.25cm}
\hbox to 5.78in { {COMS E6998-9:\hspace{0.12cm}Algorithmic
Techniques for Massive Data} \hfill #2 }
\vspace{0.48cm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{0.42cm}
\hbox to 5.78in { {#3 \hfill #4} }\vspace{0.25cm}
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribes:\hspace{0.08cm}#4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{example}[theorem]{Example}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\begin{document}
\lecture{18 --- Distribution \& Monotonicity \& Graph Testing}{Nov 10, 2015}{Instructor:\hspace{0.08cm}\emph{Alex Andoni}}{\emph{Yuan Feng, Vishal Juneja}}
\section{Introduction}
In today's lecture, we first finished the proof of the Uniformity Testing lemma. Then we went through other property-testing problems, including Identity Testing, Equality Testing, and Independence Testing. Next, we discussed a sublinear-time Monotonicity Testing algorithm, based on the approximation notion of being $\epsilon$-far. Finally, we saw how to use such an approximation in Sparse Graph Testing.
\section{Uniformity Testing}
Recall the lemma from the last lecture, which we prove using two claims:
\begin{lemma}
$\frac{C}{M}$ allows us to distinguish between the two cases above, as long as $m = \Omega(\frac{\sqrt{n}}{\epsilon^4})$. $M$ is the normalization constant $M=\binom{m}{2}$.
\end{lemma}
\noindent
We let $d = \|D\|_2^2$, and have the two claims:
\begin{claim}
$E\left[ \frac{C}{M} \right]=d$
\end{claim}
\begin{claim}
$Var\left[ \frac{C}{M} \right]\le\frac{d}{M}+\frac{8d^2\sqrt{n}}{m}$
\end{claim}
\noindent
Now let's finish the proof for the lemma:
\begin{proof}
We use the lower bound $d \ge \frac{1}{n}$ (with equality exactly when $D$ is uniform):
\begin{align*}
Var \left[ \frac{C}{M} \right] &\le\frac{d}{M}+\frac{8d^2\sqrt{n}}{m}\\
&\le \frac{d^2}{Md} + \frac{8\sqrt{n}}{m}\cdot d^2\\
&\le \frac{n}{M}\cdot d^2 + \frac{8\sqrt{n}}{m}\cdot d^2\\
&\le O(1)\cdot\left[ \epsilon ^8 d^2 + \epsilon ^4 d^2 \right]\\
&\le O(\epsilon ^4 d^2)
\end{align*}
$ $\newline
By Chebyshev's, this means with probability 90\%,
\noindent $ $\newline
If D is uniform distribution:
$ $\newline
\begin{align*}
\frac{C}{M}&\le d+O(\epsilon ^4 d^2)^\frac{1}{2}\\
&\le d+O(\epsilon ^2 d)\\
&\le \frac{1}{n}+\frac{O(\epsilon ^2)}{n}\\
&<\frac{1}{n}+\frac{\epsilon ^2}{2n}
\end{align*}
If D is not uniform distribution:
\begin{align*}
\frac{C}{M} &\ge d-O(\epsilon ^2 d)\\
&\ge d(1-O(\epsilon ^2))\\
&\ge \frac{1}{n}+\frac{\epsilon ^2}{2n}
\end{align*}
This corresponds to the algorithm:
\begin{algorithm}
\begin{algorithmic}
\IF{$C < \frac{am^2}{n}$}
\RETURN uniform
\ELSE
\RETURN nonuniform
\ENDIF
\end{algorithmic}
\end{algorithm}
\noindent $ $\newline
Note: $a$ is chosen as a suitable constant in the above algorithm.
\end{proof}
\section{Identity Testing}
\textbf{Problem:}
$ $\newline
If we have known distribution $p$, and given samples from another unknown distribution \textit{q}, we need to distinguish between:
$p=q$ vs. $\|p-q\|_1 \ge \epsilon$.
Note that Uniformity Testing is just an instance of this, where $p=U_n$. Another example: testing whether an unknown distribution $q$ is a binomial distribution or not.
$ $\newline
By the classic $\chi ^ 2$ test [Pearson 1900], we let $X_i$ to be the number of occurrences of $i$:
\begin{align*}
\sum\limits_i \frac{(X_i-kp_i)^2-kp_i}{p_i} \ge \alpha
\end{align*}
where $k$ is the total number of samples, playing the same role as $m$ in Uniformity Testing; note that $E[X_i]=kp_i$.
In this case $O(n)$ samples suffice, which gives a better bound, but is not optimal.
$ $\newline
The test of [Valiant, Valiant 2014] gives a better bound of $O(\sqrt{n})$ samples, which is near-optimal:
\begin{align*}
\sum\limits_i \frac{(X_i-kp_i)^2-X_i}{p_i^\frac{2}{3}} \ge \alpha
\end{align*}
\section{More properties in Distribution Testing}
\subsection{Equality Testing}
\textbf{Problem:} Given samples from unknown distributions $p, q$, distinguish: $p=q$ vs. $\|p-q\|_1 \ge \epsilon$.
$ $\newline
\textbf{Sample bound:} $\Theta_\epsilon (n^\frac{2}{3})$
\subsection{Independence Testing}
\textbf{Problem:} Given samples from a joint distribution $(p, q)$ on $[n]\times [n]$, distinguish: $p$ is independent of $q$ (i.e., the joint distribution is a product distribution) vs. $\|(p,q)-A\times B\|_1 \ge \epsilon$ for all distributions $A, B$ on $[n]$.
$ $\newline
\textbf{Sample bound:} $\widetilde{\Theta_\epsilon}(n^\frac{4}{3})$
$ $\newline
$ $\newline
Since the input space is $O(n^2)$ so the above algorithm is still sub-linear.
\subsection{More}
There are more properties testing. Some of them use linear programming.
\section{Monotonicity Testing}
This is a little bit different set-up:
$ $\newline
\textbf{Problem:} Given a sequence $x_1,...x_n$, distinguish between:
the sequence is sorted vs. the sequence is NOT sorted.
$ $\newline
$ $\newline
We want this to be done in sublinear time $o(n)$, which is impossible to do exactly, because there can be just a single inversion hidden somewhere. Hence we need some notion of approximation; here we introduce the concept of $\epsilon$-far.
$ $\newline
$\boldsymbol{\epsilon}$\textbf{-far} in Monotonicity Testing is defined as follows:
a sequence is $\epsilon$-far from sorted if we need to delete an $\epsilon$ fraction of its elements to convert it into a sorted sequence.
\subsection{Testing}
Randomly sample a position $i$, then do a binary search for the value $x_i$ over the (presumably sorted) sequence: compare $x_i$ to the middle element $x_m$ of the current range $[x_s, x_t]$, and
\begin{itemize}
\item if $x_i < x_m$, we recurse on the left;
\item if $x_i > x_m$, we recurse on the right.
\end{itemize}
We only need to repeat for $\frac{3}{\epsilon}$ iterations to distinguish whether the sequence is sorted or not with a sufficient guarantee. The intuition is to identify cases of violation in an iteration. Note that in this process, a violation occurs (the sequence fails) if:
\begin{enumerate}
\item some $x_i$ is not where it should be
\item some $x_m \notin [x_s, x_t]$
\end{enumerate}
And if there is no failure in any repetition, we will argue that the sequence is sorted.
\subsection{Analysis}
If the sequence passes the algorithm (no iteration reports a violation), we would like to conclude that it is close to sorted. We will argue this via the contrapositive: if the sequence is $\epsilon$-far from sorted, the algorithm detects a violation with good probability.
\begin{lemma}
If one iteration succeeds with probability $\ge 1-\epsilon$, then the sequence is $\le \epsilon$-far from a sorted sequence. Consequently, we need only $\frac{3}{\epsilon}$ iterations to catch the case when the sequence is $> \epsilon$-far from sorted with probability at least 90\%.
\end{lemma}
\begin{proof}
We call $i\in [n]$ good if it passes the test, i.e., the binary search for $x_i$ ends at position $i$ without any violation.
\begin{claim}
If $i < j$ and both $i$ and $j$ are good, then $x_i \le x_j$.
\end{claim}
\noindent
By the claim, the good positions form a sorted subsequence. If one iteration succeeds with probability $\ge 1-\epsilon$, then at least a $(1-\epsilon)$ fraction of the positions are good; deleting the at most $\epsilon$ fraction of bad positions leaves a sorted sequence, so the sequence is $\le \epsilon$-far from sorted.
\end{proof}
\section{Sparse Graph Testing}
\textbf{Problem:} Given a graph $G$ on $n$ nodes with degree bound $d$, distinguish: $G$ is connected vs. $G$ is $\epsilon$-far from connected, where $\epsilon$-far means at least $\epsilon d n$ edge modifications are needed to make $G$ connected.
\begin{claim}
If $\epsilon d > 1$, then every graph $G$ with $\deg \le d$ is $\epsilon$-close to a connected graph.
\end{claim}
\begin{proof}
If we add the edges of a spanning tree over the connected components, we modify at most $n-1 \le \epsilon d n$ edges (using $\epsilon d > 1$).
\end{proof}
\subsection{Algorithm}
For each iteration: we randomly select a node $s$ and do a BFS from $s$ until we have seen more than $\frac{4}{\epsilon d}$ nodes in its connected component (CC). If the CC's size is smaller than that, report ``disconnected.''
$ $\newline
We repeat for $r=O(\frac{1}{\epsilon d})$ iterations; if no iteration reports ``disconnected,'' we report ``connected.''
\subsection{Analysis}
\begin{claim}
If the graph is $\epsilon$-far from connected, it has at least $\Omega (\epsilon d n)$ connected components.
\end{claim}
\begin{proof}
Suppose $G$ has $c$ connected components; then it can be made connected using $O(c)$ modifications, by just connecting the connected components consecutively.
$ $\newline
\textbf{Special case:} a CC can have all its nodes at full degree. In this case, we first delete one edge within the CC and then add an edge between this CC and another one. So even in the worst case, when every CC is of full degree, we need only $O(c) = 2c$ modifications.
$ $\newline
Since $G$ is $\epsilon$-far from connected, at least $\epsilon d n$ modifications are needed, so $2c \ge \epsilon d n$, i.e., $c = \Omega(\epsilon d n)$.
\end{proof}
$ $\newline
So, on average a CC has $O(\frac{n}{\epsilon d n})=O(\frac{1}{\epsilon d})$ nodes; in particular, $\Omega(\epsilon d n)$ of the CCs have size $O(\frac{1}{\epsilon d})$. Hence a randomly chosen start node lands in such a small CC with probability at least $\Omega(\epsilon d)$, and $r=O(\frac{1}{\epsilon d})$ repetitions find one with constant probability.
\end{document}