用FlashGet批量下载文件后,面临一个需要过滤出原文件名并批量改名的问题。我的解决方案是:
1.在批量下载时,同时下载该网页的HTML文件;
2.用Python解析该HTML文件,过滤出每个链接的地址及其对应的文件名;
3.将过滤得到的输出文件修改成Bat文件,运行后即可批量更改文件名;
=====================================================================
Python过滤源代码
# -*- coding: cp936 -*-
# yankchina@gmail.com
# 2007-06-26 pass
from sgmllib import SGMLParser
class MyHtmlFilter(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.inValid = False
        self.fileHrefs = []
        self.fileNames = []
    def start_a(self,attrs):
        #idText = [ v for k, v in attrs if k == 'class' ]
        fileHref = [ v for k, v in attrs if k == 'href' ]
        #if idText == 'a01' :
        self.inValid = True
        self.fileHrefs.append( fileHref )
    def end_a(self):
        self.inValid = False
   
    def handle_data(self, text ):
        if self.inValid:
            self.fileNames.append( text )
       
       
def FilteHtmlFile( inFileName,outFile ):
    """Extract the links of an HTML file and write one line per link.

    The file is parsed with MyHtmlFilter. For every collected link a line
    made of the first href value, three spaces, the link text and the
    ".doc" suffix is written to outFile. The counts and each pair are also
    printed to standard output.

    inFileName -- path of the HTML file to read; if the path does not
                  exist nothing is read, written or printed.
    outFile    -- object with a write() method that receives the lines.
    """
    import os
    if not os.path.exists( inFileName ):
        return
    inFile = open( inFileName, "r" )
    try:
        html = inFile.read()
    finally:
        # Release the handle even when reading fails.
        inFile.close()
    parser = MyHtmlFilter()
    parser.feed( html )
    parser.close()
    print( len( parser.fileNames ) )
    # Walk the two lists in lockstep so that a length mismatch cannot raise
    # IndexError; pairing stops at the shorter list.
    for hrefs, name in zip( parser.fileHrefs, parser.fileNames ):
        if not hrefs:
            # Anchor without an href attribute: there is no target to write.
            continue
        outFile.write( hrefs[0] + "   " + name + ".doc \n " )
        print( name )
        print( hrefs )
    print( "%s %s" % ( inFileName, len( parser.fileHrefs ) ) )
       
if __name__ == "__main__" :
    # Parse the saved page "1.htm" and write the extracted lines to
    # "output.txt" in the current directory.
    outFileName = 'output.txt'
    htmlFileName = "1.htm"
    outFile = open( outFileName, "w" )
    try:
        FilteHtmlFile(htmlFileName, outFile )
    finally:
        # Close the output even if parsing fails so that the lines already
        # written are flushed to disk.
        outFile.close()
 
 
 博文
博文
 
 
0 评论:
发表评论