

import csv
import errno
import os
import re

## A function to get all of the filepaths
def filepaths(top_path):
    """Walk top_path recursively and lazily yield one pair per file found.

    Each pair is (bare file name, directory joined with that file name).
    Directories themselves are never yielded.
    """
    for folder, _subfolders, filenames in os.walk(top_path):
        joined = [os.path.join(folder, fname) for fname in filenames]
        for pair in zip(filenames, joined):
            yield pair


## inpath is walked for the saved html pages; outpath receives one text file per post
inpath = "C:/Users/Richard Nielsen/Desktop/Professional Stuff/talks/text analysis talk/sss_html/"
outpath = "C:/Users/Richard Nielsen/Desktop/Professional Stuff/talks/text analysis talk/sss_posts/"

## start a csv file for the data
csv_path = "C:/Users/Richard Nielsen/Desktop/Professional Stuff/talks/text analysis talk/sss_data.csv"
## "w" truncates whatever an earlier run left behind, so the file starts with
## just the header row; the with-block closes the handle even if the write fails
with open(csv_path, "w") as f:
    f.write("filename,author,poster,posttime,date\n")

## get the file paths for the html
fp = filepaths(inpath)

## patterns used for every post, compiled once outside the loops
author_re = re.compile(r"</h3>\n\n<p><strong>.*?</strong></p>")
tag_re = re.compile(r"<.*?>")
date_re = re.compile(r"[0-9]{1,2} .{1,10} [0-9]{4}")
time_re = re.compile(r"[0-9]{1,2}:[0-9]{2} [AP]{1}M")
## anchored and escaped so only a trailing ".htm"/".html" is removed
## (an unanchored ".htm" turns "page.html" into "pagel")
ext_re = re.compile(r"\.html?$")

for name, path in fp:
    ## single-argument print(...) behaves the same as the print statement
    print(name)
    ## get the html
    with open(path) as f:
        html = f.read()
    ## cut off the header and footer
    ## (raises IndexError if the page has no BlogContent div)
    html = html.split("<div class=\"BlogContent\">")[1].split("<div id=\"footer\">")[0]
    ## split out the blog posts; everything before the first "<h3 id=" is not a post
    chunks = html.split("<h3 id=")
    posts = chunks[1:]

    ## get dates: a post's date is searched for in the tail of the chunk that
    ## precedes it.  posts and chunks[:-1] are both len(chunks)-1 long by
    ## construction, so they always pair up one-to-one.
    dates = []
    for chunk in chunks[:-1]:
        ## NOTE: the slice also drops the chunk's final character (kept as-is)
        found = date_re.findall(chunk[-100:-1])
        if found:
            dates.append(found[0])
        else:
            dates.append("")

    ## for each post, get the author, poster, post time and text.
    ## Everything is parsed before anything is written, so a page that fails
    ## to parse leaves no partial output behind.
    records = []
    for i, p in enumerate(posts):
        ## early posts listed an author sometimes
        m = author_re.search(p)
        if m:
            au = tag_re.sub(" ", m.group(0)).replace("\n", "").strip()
            ## there are some commas in the author field; they are still
            ## removed so the csv contents match earlier runs
            au = au.replace(",", "")
            posttext = author_re.sub(" ", p)
        else:
            au = ""
            posttext = p

        ## get the posted by name (IndexError here means the "posted"
        ## paragraph is missing or has an unexpected layout)
        posted = p.split("<p class=\"posted\">")[1]
        poster = posted.split("</a>")[0].split("Posted by")[1]
        poster = tag_re.sub(" ", poster).strip()

        ## get the post time
        posttime = time_re.findall(posted.split("</a> at ")[1])[0]

        ## get the post text: keep what precedes the "posted" paragraph, then
        ## wrap it in "<" ... ">" so the tag fragments cut open at either end
        ## by the splits are removed together with the complete tags
        posttext = posttext.split("class=\"posted\"")[0]
        posttext = tag_re.sub(" ", "<" + posttext + ">").strip()

        ## sanity check: stop loudly if a run of ">" survived the tag stripping
        if ">>>>" in posttext:
            raise ValueError("'>>>>' left in the text of post %d of %s" % (i, name))

        records.append((au, poster, posttime, posttext))

    ## put everything in a spreadsheet or flat text file
    base = ext_re.sub("", name)
    with open(csv_path, "a") as csv_file:
        ## the csv module quotes any field containing a comma or quote;
        ## lineterminator="\n" keeps row endings the same as the header line
        writer = csv.writer(csv_file, lineterminator="\n")
        for i, (record, date) in enumerate(zip(records, dates)):
            au, poster, posttime, posttext = record
            ## output the text to a file
            outfilename = base + "_" + str(i) + ".txt"
            with open(os.path.join(outpath, outfilename), "w") as f:
                f.write(posttext)
            ## save the data in a csv
            writer.writerow([outfilename, au, poster, posttime, date])

print("done")


