![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() ![]() |
![]() |
以下介紹的,都是針對「輸入文字型」的驗證碼的實作!
# pytesseract can perform text recognition (OCR)
from PIL import Image
import pytesseract

# Open the sample image and print the text recognized with the English model.
sample_image = Image.open('slide_image/ocr-eng.png')
print(pytesseract.image_to_string(sample_image, lang="eng"))
Pyladies is a group of women developers who love the Python programming language. We are an intemat‘lonal mentorship group 9 with a focus on helping more women become active participants and leaders in the Python open-source community. We host monthly meetups with different topics such as beginners meetups, project of python presentation and tutorial Open to all who identify as women. Feel free to join us!
from PIL import Image
import pytesseract

# Run OCR on each captcha image in turn and print the recognized text.
for captcha_path in ('captcha/1-284176.png', 'captcha/1-882579.png'):
    print(pytesseract.image_to_string(Image.open(captcha_path)))
284176 882579
# Noisy captchas: feeding them straight to pytesseract gives poor results
from PIL import Image
import pytesseract

# Same files, same order as before; each result is printed on its own.
noisy_captcha_paths = (
    'captcha/2-J45Z.png',
    'captcha/2-R59X.png',
    'captcha/2-4089.png',
    'captcha/2-9198.png',
    'captcha/2-0479.png',
    'captcha/2-1430.png',
    'captcha/2-49586.png',
    'captcha/2-88860.png',
)
for captcha_path in noisy_captcha_paths:
    print(pytesseract.image_to_string(Image.open(captcha_path)))
J45Z 9198 49536, ‘ 89860 V
def threshold(filein, fileout, limit=100):
    """Binarize an image to pure black and white and save the result.

    The image is converted to RGBA. Every pixel that has at least one of its
    R, G, B channels below ``limit`` becomes opaque black; every other pixel
    becomes opaque white.

    Args:
        filein: Path of the image to read.
        fileout: Path the binarized image is written to.
        limit: Channel cut-off value; defaults to 100.
    """
    black = (0, 0, 0, 255)
    white = (255, 255, 255, 255)

    # Release the source file as soon as the RGBA copy has been made.
    with Image.open(filein) as source:
        img = source.convert('RGBA')

    pixdata = img.load()
    width, height = img.size
    for y in range(height):
        for x in range(width):
            red, green, blue = pixdata[x, y][:3]
            # If any one of the R, G, B channels is below the limit,
            # turn the pixel black; otherwise turn it white.
            if red < limit or green < limit or blue < limit:
                pixdata[x, y] = black
            else:
                pixdata[x, y] = white
    img.save(fileout)
from PIL import Image
import pytesseract

# Write a thresholded copy of one noisy captcha, then OCR the original
# followed by the thresholded copy.
threshold('captcha/2-R59X.png', 'captcha/2-R59X_threshold.png')
for candidate_path in ('captcha/2-R59X.png', 'captcha/2-R59X_threshold.png'):
    print(pytesseract.image_to_string(Image.open(candidate_path)))
R5 9X
通常多為需要購買才能使用其服務,優點是速度快
=> 會選擇 2captcha 是因為剛好搜尋到,而且不需要信用卡帳戶、也不必付費就能使用
=> 自己當工人解碼6張,大約有0.006,API處理一張簡單的驗證碼花費0.00095
=> 解一張可以讓API使用10次
from captcha2 import CaptchaUpload
import time

# Measure how long one solve request through the captcha2 client takes.
# perf_counter() is a monotonic clock meant for timing intervals, so the
# result cannot be skewed by system clock adjustments.
tstart = time.perf_counter()
_API_KEY = "YOUR_KEY"  # placeholder: replace with your own API key
captcha = CaptchaUpload(_API_KEY)
print(captcha.solve('captcha/1-284176.png'))
tstop = time.perf_counter()
print(tstop - tstart)  # elapsed seconds
284176 15.581701040267944
from PIL import Image
import pytesseract
import time

# Measure how long local OCR of the same captcha image takes.
# perf_counter() is a monotonic clock meant for timing intervals, so the
# result cannot be skewed by system clock adjustments.
tstart = time.perf_counter()
print(pytesseract.image_to_string(Image.open('captcha/1-284176.png')))
tstop = time.perf_counter()
print(tstop - tstart)  # elapsed seconds
284176 0.1440131664276123
from PIL import Image

# Crop a fixed rectangle out of the screenshot and save it.
crop_box = (240, 110, 350, 140)  # left, top, right, bottom
img = Image.open('slide_image/govtw_cmpyinfo.png')
img2 = img.crop(crop_box)
img2.save('temp.png')
img2  # bare expression; presumably displays the crop as the notebook cell output
試試看,取出「經濟部商業司─公司及分公司基本資料查詢」網頁的驗證碼 http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do
from selenium import webdriver
import signal

# Set up the browser, set the window size, open the page, take a screenshot
driver = webdriver.PhantomJS()
try:
    driver.set_window_size(1024, 768)
    driver.get("http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do")
    driver.save_screenshot('cmpyInfo_raw.jpg')
    # Crop the image
    # Your Code
finally:
    # Don't forget to shut down the browser process; doing it in `finally`
    # means it is also cleaned up when one of the steps above raises.
    driver.service.process.send_signal(signal.SIGTERM)
    driver.quit()
觀察要點:對著驗證碼按下右鍵「在新分頁中開啟圖片」,重新整理頁面
from selenium import webdriver
from selenium.webdriver.common.by import By
from PIL import Image
import pytesseract
import signal

# Set up the browser, set the window size, open the page, take a screenshot
driver = webdriver.PhantomJS()
try:
    driver.set_window_size(1024, 768)
    driver.get("http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do")
    driver.save_screenshot('cmpyInfo_raw.jpg')

    # Crop the captcha region out of the screenshot
    img = Image.open('cmpyInfo_raw.jpg')
    img2 = img.crop((260, 145, 370, 180))  # left top right bottom
    img2.save('captcha.jpg')

    # Recognize the captcha text. OCR output may carry trailing whitespace
    # (newline / form feed); strip it so stray characters are not typed
    # into the form below.
    captcha = pytesseract.image_to_string(Image.open('captcha.jpg')).strip()
    print(captcha)

    # Fill in the form fields
    text_box = driver.find_element(By.XPATH, "//input[@name='queryStr']")
    text_box.send_keys("玩咖旅行社")
    text_box = driver.find_element(By.XPATH, "//input[@name='imageCode']")
    text_box.send_keys(captcha)
    driver.save_screenshot('cmpyInfo_ready.jpg')

    # Submit
    button = driver.find_element(By.XPATH, "//input[@name='submitData']")
    button.click()
    driver.save_screenshot('cmpyInfo_submit.jpg')
finally:
    # Don't forget to shut down the browser process; doing it in `finally`
    # means it is also cleaned up when one of the steps above raises.
    driver.service.process.send_signal(signal.SIGTERM)
    driver.quit()
277133
import requests
from lxml import etree
from PIL import Image
import pytesseract

# Open the page
resp = requests.get("http://www.post.gov.tw/post/internet/SearchZone/index.jsp?ID=130112")

# Extract the captcha image URL; the text after "&vKey=" in that URL is kept
# in vKey for later use.
html = resp.text
page = etree.HTML(html)
image_src = page.xpath("//img[@id='imgCaptcha3']/@src")[0]
vKey = image_src.split("&vKey=")[-1]

# Download the image
resp_captcha = requests.get("http://www.post.gov.tw/post/internet/" + image_src.replace("../", ""))
img = resp_captcha.content
# `with` guarantees the file is closed even if the write fails.
with open("post_gov_captcha.jpg", "wb") as fileout:
    fileout.write(img)

# Recognize the captcha text. OCR output may carry trailing whitespace
# (newline / form feed); strip it so only the code itself is kept.
captcha = pytesseract.image_to_string(Image.open('post_gov_captcha.jpg')).strip()
print(captcha)
8508
# Submit the form data with requests' get/post
post_data = {
    "do_s_1": "1",
    "vKey": vKey,
    "city": "臺北市",
    "change_city": "2",
    "cityarea": "中山區",
    "street": "中山北路2段",
    "lane": "",
    "alley": "",
    "num": "31",
    "num_hyphen": "",
    "fl": "9",
    "hyphen": "",
    "suite": "",
    "list": "true",
    # OCR output may carry trailing whitespace (newline / form feed);
    # post only the recognized code itself.
    "checkImange": captcha.strip(),
    "submit": "查詢",
}
resp = requests.post("http://www.post.gov.tw/post/internet/Postal/index.jsp?ID=207", data=post_data)
html = resp.text
page = etree.HTML(html)
# Join every text node of the second row of the first 'TableStyle_02' table.
eng_address = "".join(page.xpath("//table[contains(@class,'TableStyle_02')][1]//tr[2]//text()")).strip()
print(eng_address)
9F., No.31, Sec. 2, Zhongshan N. Rd., Zhongshan Dist., Taipei City 104, Taiwan (R.O.C.)
試試看,以取貨編號 E42981808304 在「E-Tracking 交易系統」查詢
https://eservice.7-11.com.tw/E-Tracking/search.aspx
import requests
from lxml import etree
from PIL import Image
import pytesseract

# One Session is used for the page, the captcha image and the form POST.
resq = requests.Session()
resp = resq.get("https://eservice.7-11.com.tw/E-Tracking/search.aspx")
html = resp.text
page = etree.HTML(html)
image_src = page.xpath("//img[@id='ImgVCode']/@src")[0]
# Hidden form fields read from the page; they are sent back with the POST.
__VIEWSTATE = page.xpath("//input[@name='__VIEWSTATE']/@value")[0]
__VIEWSTATEGENERATOR = page.xpath("//input[@name='__VIEWSTATEGENERATOR']/@value")[0]

# Download the image
resp_captcha = resq.get("https://eservice.7-11.com.tw/E-Tracking/" + image_src)
img = resp_captcha.content
# `with` guarantees the file is closed even if the write fails.
with open("7-11_captcha.jpg", "wb") as fileout:
    fileout.write(img)

# Recognize the captcha text. OCR output may carry trailing whitespace
# (newline / form feed); strip it so only the code itself is posted.
captcha = pytesseract.image_to_string(Image.open('7-11_captcha.jpg')).strip()
print(captcha)

# Submit the form data with requests' get/post
post_data = {
    "__EVENTTARGET": "",
    "__EVENTARGUMENT": "",
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "txtProductNum": "E42981808304",
    "tbChkCode": captcha,
    "btUserSearch": "查 詢",
}
resp = resq.post("https://eservice.7-11.com.tw/E-Tracking/search.aspx", data=post_data)
html = resp.content.decode("utf8")
print(html)
Unix系統上同時有Python2.x與Python3.x的話,要安裝給Python3.x需使用:pip3
sudo apt-get install tesseract-ocr
https://github.com/tesseract-ocr/tesseract/wiki/Downloads
pip install pytesseract
pip3 install pytesseract
安裝captcha2,因為官方範例程式captcha2upload只支援Python 2.x。
pip install captcha2
pip3 install captcha2