查看完整版本: the python code for holyokemall/hampshiremall deal info

imac 2006-7-14 09:29 PM

the python code for holyokemall/hampshiremall deal info

[code]

#!/usr/local/bin/python

import datetime

def gendatestr(diff):
    """Return the date `diff` days from today as a "M/D/YYYY" string.

    `diff` is passed to datetime.timedelta(days=...), so negative values
    give dates in the past. Month and day are not zero-padded.
    """
    target = datetime.date.today() + datetime.timedelta(days=diff)
    fields = (target.month, target.day, target.year)
    return "/".join([str(field) for field in fields])

import urllib

def crawler(baseurl, diff):
    """Download the page for the date `diff` days from today.

    The request URL is `baseurl` with the "M/D/YYYY" string from
    gendatestr(diff) appended.

    Returns the page body as read from the connection, or False when the
    URL could not be opened (IOError from urlopen). Callers must check for
    False before treating the result as text.
    """
    request = baseurl + gendatestr(diff)
    try:
        conn = urllib.urlopen(request)
    except IOError:
        return False
    # Close the connection even if read() raises, so a failed transfer
    # does not leak the socket. (try/finally rather than `with` keeps this
    # working on old Python 2 interpreters.)
    try:
        return conn.read()
    finally:
        conn.close()

import re

# The patterns are compiled once at import time instead of on every pass
# through the table loop. Raw strings avoid the invalid "\/" escape.
_RE_FLAGS = re.I | re.M | re.S
_COMMENT_RE = re.compile(r'<!--.*?-->', _RE_FLAGS)
_TABLE_RE = re.compile(
    r'<table border="0" cellpadding="0" cellspacing="0" '
    r'class="listheadertable">.*?</table>',
    _RE_FLAGS)
_TITLE_RE = re.compile(
    r'(?<=<td colspan="4" class="calendarlisttitle">).*?(?=</td>)',
    _RE_FLAGS)
_DATE_RE = re.compile(
    r'(?<=<td colspan="4" class="upcoming_datelist">).*?(?=</td>)',
    _RE_FLAGS)
_CONTENT_RE = re.compile(
    r'(?<=<td colspan="4">).*?(?=</td>)',
    _RE_FLAGS)


def parser(s):
    """Extract event triplets from a calendar page.

    HTML comments are stripped first. Every
    <table ... class="listheadertable"> ... </table> section then yields
    one triplet [titles, dates, contents], where each member is the list
    of cell bodies found in that table for the matching <td> kind
    (class "calendarlisttitle", class "upcoming_datelist", and the plain
    <td colspan="4"> cells respectively).

    `s` is the page text. A falsy value (empty string, None, or the False
    that crawler returns when a download fails) yields an empty list
    instead of raising.

    Returns a list of triplets, one per matched table, in page order.
    """
    if not s:
        return []
    s = _COMMENT_RE.sub("", s)
    triplets = []
    for table in _TABLE_RE.findall(s):
        triplets.append([
            _TITLE_RE.findall(table),
            _DATE_RE.findall(table),
            _CONTENT_RE.findall(table),
        ])
    return triplets

def writer(filename, sumlist):
    """Write the collected events to "<filename>.xml".

    `filename` is used both as the output file's base name and as the
    name of the XML root element.

    `sumlist` is a sequence of [tag, triplets] pairs. Each pair becomes a
    <tag> element holding one <tname>/<tdate>/<tcontent> group per
    triplet; every triplet member is written with str().

    NOTE(review): values are inserted verbatim, without XML escaping, so
    the result is only well-formed if the values contain no markup --
    confirm before feeding the file to a strict XML parser.
    """
    # Collect fragments and join once instead of growing a string with "+".
    parts = ["<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n<" + filename + ">\n"]
    for ele in sumlist:
        tag = str(ele[0])
        parts.append("<" + tag + ">\n")
        for k in ele[1]:
            parts.append("<tname>\n" + str(k[0]) + "\n</tname>\n")
            parts.append("<tdate>\n" + str(k[1]) + "\n</tdate>\n")
            parts.append("<tcontent>\n" + str(k[2]) + "\n</tcontent>\n")
        parts.append("</" + tag + ">\n")
    parts.append("</" + filename + ">\n")

    f = open(filename + '.xml', 'wt')
    # Guarantee the handle is released even when the write fails.
    try:
        f.write("".join(parts))
    finally:
        f.close()

def writer_html(filename, sumlist):
    """Write the collected events to "<filename>.html" as a one-column table.

    `sumlist` is a sequence of [label, triplets] pairs. Each pair becomes
    one table row: the label between dashes, followed by the name, date
    and content text of every triplet, concatenated.

    Each triplet member is converted with str() and its first and last two
    characters are dropped. This presumably strips the "['" and "']" of a
    single-element list repr such as parser produces -- verify against the
    caller. Because the text comes from a repr, tabs and CR/LF pairs show
    up as the escape sequences backslash-t and backslash-r-backslash-n;
    those are turned into a space and "<br />" respectively, and empty
    "<p></p>" pairs become a space.
    """
    parts = ["<html>\n<table border=1>\n"]
    for ele in sumlist:
        parts.append("<tr>\n<td>\n------" + str(ele[0]) + "------<br /><br />")
        for k in ele[1]:
            # Drop the two leading and two trailing repr characters.
            tname = str(k[0])[2:-2]
            tdate = str(k[1])[2:-2]
            tcontent = str(k[2])[2:-2]
            ttstr = tname + tdate + tcontent
            # The patterns are plain literals, so str.replace is enough;
            # '\\t' is a backslash followed by "t", not a tab character.
            ttstr = ttstr.replace('\\t', ' ')
            ttstr = ttstr.replace('\\r\\n', '<br />')
            ttstr = ttstr.replace('<p></p>', ' ')
            parts.append(ttstr)
        parts.append("</td></tr>\n")
    parts.append("</table></html>\n")

    f = open(filename + '.html', 'wt')
    # Guarantee the handle is released even when the write fails.
    try:
        f.write("".join(parts))
    finally:
        f.close()

#main function loop

# Calendar detail pages; the date string from gendatestr is appended.
http_hampshiremall = "http://www.hampshiremall.com/calendar.asp?action=detail&incDate="
http_holyokemall = "http://www.holyokemall.com/calendar.asp?action=detail&incDate="

# Number of consecutive days, starting today, fetched for each mall.
maxdays = 7


def _collect_days(baseurl, numdays):
    """Fetch and parse `numdays` calendar pages starting from today.

    Returns (sumlist, sumlist_html): two parallel lists with one
    [label, triplets] entry per day. The first labels days as
    "M_D_Y<m>_<d>_<y>" (used as XML element names by writer), the second
    uses the plain "m/d/y" date string (shown by writer_html).
    """
    sumlist = []
    sumlist_html = []
    for days in range(numdays):
        sdate = gendatestr(days)
        s = crawler(baseurl, days)
        # crawler returns False when the page cannot be opened; record the
        # day with no events instead of handing False to the parser, which
        # would abort the whole run.
        if s:
            triplets = parser(s)
        else:
            triplets = []
        sumlist.append(["M_D_Y" + sdate.replace("/", "_"), triplets])
        sumlist_html.append([sdate, triplets])
    return sumlist, sumlist_html


# Produce <name>.xml and <name>.html for each mall.
for filename, baseurl in (("hampshiremall", http_hampshiremall),
                          ("holyokemall", http_holyokemall)):
    sumlist, sumlist_html = _collect_days(baseurl, maxdays)
    writer(filename, sumlist)
    writer_html(filename, sumlist_html)

[/code]

imac 2006-7-14 09:31 PM

Faint... the HTML kills all my tab indentation.

Python needs indentation to identify the logical structure...

So if anyone wants to use this code, email me to ask for it, or add the indentation yourself (it is not difficult if you can understand the code...)

good luck

chao 2006-7-14 09:31 PM

顶啊!
大家很快就能在主页上看到local的一些deal了
[url]http://www.umasscssa.org/[/url]

那一片俗 2006-7-21 01:21 AM

虽然对编程一窍不通,但是。。。。。。。。。。。。。。还是要来灌一次水
嘿嘿

freei 2006-9-8 05:01 PM

呵呵,偶在上baby course, 191B, special interest group里面有python group, 会不会特别难~


[quote]原帖由 [i]那一片俗[/i] 于 2006-7-21 12:21 AM 发表
虽然对编程一窍不通,但是。。。。。。。。。。。。。。还是要来灌一次水
嘿嘿 [/quote]

yuxi 2006-9-8 05:05 PM

俺们系要开python seminar,看来俺要去上一个。

chao 2006-9-8 05:19 PM

欢迎参加python discussion!
有什么问题放上来吧
imac免费解答!:):)

满弓刀 2006-9-8 05:37 PM

往哪放啊?

chao 2006-9-8 09:34 PM

[quote]原帖由 [i]满弓刀[/i] 于 2006-9-8 05:37 PM 发表
往哪放啊? [/quote]
贴上在这个版啊
页: [1]
查看完整版本: the python code for holyokemall/hampshiremall deal info