如下所示:#coding=utf-8 import sys, re, os def getDictList(dict): regx = '''[\w\~`\!\@\#\$\%\^\&\*\(\)\_\-\+\=\[\]\{\}\:\;\,…
如下所示:
#coding=utf-8
import sys, re, os
def getDictList(dict):
    """Extract every token from the text file at path ``dict``.

    A token is a maximal run of word characters and the punctuation listed
    in the pattern below; whitespace and any other character act as
    separators.

    Args:
        dict: Path of the input file. The name shadows the builtin ``dict``;
            it is kept so that existing keyword callers keep working.

    Returns:
        A list of tokens in file order, duplicates included.
    """
    # Raw string: the backslashes are regex escapes, not string escapes.
    # A plain literal triggers invalid-escape warnings on Python 3.
    regx = re.compile(
        r'''[\w\~`\!\@\#\$\%\^\&\*\(\)\_\-\+\=\[\]\{\}\:\;\,\.\/\<\>\?]+''')
    tokens = []
    with open(dict) as f:
        # The character class never matches a line break, so no token can
        # span two lines. Scanning line by line therefore yields the same
        # tokens as scanning the whole file, without holding the raw file
        # contents in memory at once.
        for line in f:
            tokens.extend(regx.findall(line))
    return tokens
def rmdp(dictList):
    """Return the entries of ``dictList`` with duplicates removed.

    The first occurrence of each entry is kept and the original relative
    order is preserved, so the result is the same on every run.

    Args:
        dictList: Iterable of hashable entries.

    Returns:
        A new list containing each distinct entry exactly once.
    """
    seen = set()
    unique = []
    for entry in dictList:
        if entry not in seen:
            seen.add(entry)
            unique.append(entry)
    return unique
def fileSave(dictRmdp, out):
    """Append each entry of ``dictRmdp`` to the file ``out``, one per line.

    Args:
        dictRmdp: Iterable of strings to write.
        out: Path of the output file. It is opened in append mode, so any
            existing content is kept and the new lines are added after it.
    """
    with open(out, 'a') as f:
        # One batched call instead of a separate write() per entry.
        f.writelines(line + '\n' for line in dictRmdp)
def main():
    """Command-line entry point: de-duplicate the tokens of a text file.

    Reads the input and output paths from ``sys.argv[1]`` and
    ``sys.argv[2]``, extracts the tokens with ``getDictList``, removes
    duplicates with ``rmdp`` and writes the result with ``fileSave``.

    Raises:
        SystemExit: With status 1, after printing a usage message, when
            either command-line argument is missing.
    """
    try:
        src = sys.argv[1].strip()
        out = sys.argv[2].strip()
    except IndexError as e:
        # The single-argument print(...) form and "except ... as" are valid
        # on both Python 2.6+ and Python 3.
        print('error: %s' % e)
        me = os.path.basename(__file__)
        print('usage: %s <input> <output>' % me)
        print('example: %s dict.txt dict_rmdp.txt' % me)
        # sys.exit is always available (the bare exit() helper comes from
        # the site module); a non-zero status reports the misuse to callers.
        sys.exit(1)
    dictList = getDictList(src)
    dictRmdp = rmdp(dictList)
    fileSave(dictRmdp, out)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
以上这篇 Python 高效去重复(支持 GB 级别大文件)的示例代码,就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持脚本之家。
python 去重复 大文件