1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
import csv
import hashlib
import io
import json
import sys
import time
import uuid
from collections import Counter
from imp import reload  # NOTE: the `imp` module was removed in Python 3.12
from string import digits

import jieba
import pdfplumber
import requests
import xlwt
def encrypt(signStr):
    """Return the hexadecimal SHA-256 digest of ``signStr``.

    The string is encoded as UTF-8 before hashing.
    """
    payload = signStr.encode('utf-8')
    return hashlib.sha256(payload).hexdigest()
def truncate(q):
    """Shorten ``q`` to the form used when building the request signature.

    Returns ``None`` for ``None``. Inputs of at most 20 characters are
    returned unchanged; longer inputs become the first 10 characters, the
    decimal length of the input, and the last 10 characters.
    """
    if q is None:
        return None

    length = len(q)
    if length > 20:
        # For length > 20 the slice q[-10:] is exactly q[length - 10:length].
        return q[:10] + str(length) + q[-10:]
    return q
def PdfIdentifyText(pdf_path='./file/SPLK-1001.199Q.pdf'):
    """Extract the text of a PDF and count how often each token occurs.

    Args:
        pdf_path: Path of the PDF to read. Defaults to the file this script
            was originally written for.

    Returns:
        A list of ``(token, count)`` tuples as produced by
        ``Counter.most_common()``, i.e. ordered from most to least frequent.
        Tokens are whatever ``jieba.cut`` yields for the extracted text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can yield None for a page without text in some
            # pdfplumber versions; treat that as an empty page instead of
            # failing on string concatenation.
            page_texts.append(page.extract_text() or "")

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    PdfContent = "\n".join(page_texts)

    tokens = jieba.cut(PdfContent)
    return Counter(tokens).most_common()
def tranRequest(query_word, timeout=10):
    """Send ``query_word`` to the Youdao translation API (English -> zh-CHS).

    Args:
        query_word: The text to translate.
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up. Without a timeout a stalled connection would block
            the whole run indefinitely.

    Returns:
        The decoded JSON response as a Python object, or ``None`` when the
        server answers with ``audio/mp3`` (in that case the audio bytes are
        saved under ``./file/<millis>.mp3``).

    Raises:
        ValueError: If a non-audio response body is not valid JSON
            (raised by ``json.loads``).
        Exception: Whatever ``requests.post`` raises on network failure or
            timeout is propagated unchanged.
    """
    YOUDAO_URL = 'https://openapi.youdao.com/api'
    APP_KEY = 'xxxx'
    APP_SECRET = 'xxxx'

    curtime = str(int(time.time()))
    salt = str(uuid.uuid1())
    # signType v3: hash of appKey + truncated query + salt + curtime + secret.
    signStr = APP_KEY + truncate(query_word) + salt + curtime + APP_SECRET
    sign = encrypt(signStr)

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    data = {
        'from': 'en',
        'to': 'zh-CHS',
        'signType': 'v3',
        'curtime': curtime,
        'appKey': APP_KEY,
        'q': query_word,
        'salt': salt,
        'sign': sign,
    }

    response = requests.post(YOUDAO_URL, data=data, headers=headers,
                             timeout=timeout)

    # .get() avoids a KeyError when the server omits the header.
    contentType = response.headers.get('Content-Type', '')
    if contentType == "audio/mp3":
        millis = int(round(time.time() * 1000))
        filePath = "./file/" + str(millis) + ".mp3"
        # The context manager closes the file even if the write fails.
        with open(filePath, 'wb') as fo:
            fo.write(response.content)
        return None

    return json.loads(response.text)
def ToCsvFile(data):
    """Append one row to the module-level binary file object ``csvFile``.

    Args:
        data: Mapping with the keys ``"count"``, ``"EnglishWord"`` and
            ``"EnglishExplains"``; the values are written in that order.

    The row is formatted with the ``csv`` module so that fields containing
    commas, double quotes or line breaks are quoted instead of silently
    shifting columns. Fields without such characters are written exactly as
    plain comma-joined text terminated by ``\\r\\n``.
    """
    # csv.writer needs a text stream, while csvFile is opened in binary mode:
    # format the row in memory first, then encode it.
    line = io.StringIO(newline='')
    csv.writer(line, lineterminator='\r\n').writerow(
        [data["count"], data["EnglishWord"], data["EnglishExplains"]]
    )
    csvFile.write(line.getvalue().encode())
def DataClea(data):
    """Translate each countable token and write one CSV row per token.

    Args:
        data: Iterable of ``(token, count)`` pairs, e.g. the result of
            ``PdfIdentifyText()``.

    A token is processed only if more than one character remains after
    removing ASCII digits and surrounding whitespace. For every processed
    token ``tranRequest`` is called and a row with the count, the lower-cased
    word and the ``"||"``-joined explanations is passed to ``ToCsvFile``.
    When the response carries no usable ``basic.explains`` list, a message is
    printed and a single space is written as the explanation.

    Returns:
        None.
    """
    # Build the digit-stripping table once instead of once per token.
    strip_digits = str.maketrans('', '', digits)

    for temp in data:
        word, count = temp[0], temp[1]
        if len(word.translate(strip_digits).strip()) <= 1:
            continue

        tranRequestContent = tranRequest(word)
        # tranRequest may hand back a non-dict (e.g. None); treat that like
        # an empty response so one bad reply does not abort the whole run.
        if not isinstance(tranRequestContent, dict):
            tranRequestContent = {}

        # Error payloads may lack "query"; fall back to the word we sent.
        englishWord = (tranRequestContent.get("query") or word).lower()

        try:
            explains = "||".join(tranRequestContent["basic"]["explains"])
        except (KeyError, TypeError):
            print(englishWord + '单词翻译出错!')
            explains = " "

        ToCsvFile({
            "count": str(count),
            "EnglishWord": englishWord,
            "EnglishExplains": explains,
        })
if __name__ == '__main__':
    # ToCsvFile() writes through the module-level name `csvFile`, so the file
    # is bound to exactly that name. The `with` block guarantees it is closed
    # (and flushed) even if PDF parsing or a translation request raises.
    with open('./file/works.csv', "wb") as csvFile:
        PdfTextContent = PdfIdentifyText()
        DataClea(PdfTextContent)
|