2024年8月2日 星期五

使用 Selenium 做爬蟲 by Perl

####################### fedora linux:####################

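$ sudo yum install python3-pip
$ sudo pip3 install selenium
# 注意:上面兩行安裝的是 Python 版的 Selenium;下面的 test.pl 需要另外安裝 Perl 模組:
$ sudo cpan Selenium::Remote::Driver
$ export DISPLAY=:99
$ Xvfb :99 -screen 0 1024x768x16 &
# 執行 test.pl 之前,還需要先啟動 Selenium server(test.pl 會連線到 127.0.0.1:4444)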

$ cat test.pl
#!/usr/bin/perl
# Scrape the first three cells of the first table row on the FedEx surcharge
# page. The browser is Firefox, driven through a Selenium server that must
# already be listening on 127.0.0.1:4444.
use strict;
use warnings;
use Selenium::Remote::Driver;
#use Selenium::Firefox;

# The page is zh-tw, so the title and cell text contain non-ASCII characters.
# NOTE(review): assumes the driver returns decoded character strings (it
# decodes the JSON responses) -- confirm; drop this line if output is garbled.
binmode STDOUT, ':encoding(UTF-8)';

# Perl has no bareword true/false. Without "use strict" they are just the
# strings "true" and "false", and BOTH are truthy -- so the former
# 'debug' => false really switched debugging on. Use 1 / 0 instead.
my $driver = Selenium::Remote::Driver->new(
    browser_name       => 'firefox',
    remote_server_addr => '127.0.0.1',
    port               => '4444',
    platform           => 'linux',
    auto_close         => 1,    # end the remote session when $driver goes away
    debug              => 0,
);
#my $driver = Selenium::Firefox->new;

# Both calls request a 10000 ms implicit wait for element lookups.
# NOTE(review): they look redundant; kept as in the original until it is
# confirmed which one the installed module/server version honours.
$driver->set_timeout('implicit', 10000);
$driver->set_implicit_wait_timeout(10000);

#$driver->get("https://www.google.com");
$driver->get("https://www.fedex.com/zh-tw/shipping/surcharges.html");
print $driver->get_title(), "\n";

#############################################################
# Either form works for clicking the third <button> on the page
# (find_element/find_elements default to XPath locators).
#############################################################
#my @buttons = $driver->find_elements("//button[3]");
#print "@buttons\n";
#$buttons[0]->click;
#############################################################
my $button = $driver->find_element("//button[3]");
print "$button\n";    # debug output: stringified element object
$button->click;
#############################################################

# Reload the page after the click so the table is rendered again.
$driver->refresh;

# Print the first, second and third <td> of the table row, one per paragraph.
# A lexical loop variable replaces the three "my $data" redeclarations, which
# masked each other in the same scope.
my $row_xpath = '//div[2]/div/table/tbody/tr';
for my $cell_xpath ("$row_xpath/td", "$row_xpath/td[2]", "$row_xpath/td[3]") {
    my $cell = $driver->find_element($cell_xpath);
    #print "$cell\n";
    print "\n", $cell->get_text(), "\n";
}

# No explicit close here: auto_close => 1 above ends the session.
#$driver->close;




2024年8月1日 星期四

使用Selenium 做爬蟲 by python

sudo apt install python3-pip
sudo apt install python3-selenium
sudo apt install python3-bs4

#!/usr/bin/python3

# 載入需要的套件
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.service import Service
import requests
import time

# Open a Firefox window through geckodriver (path given explicitly).
#service = Service(executable_path='/usr/local/bin/geckodriver')
service = Service('/usr/local/bin/geckodriver')
driver = webdriver.Firefox(service=service)
# Alternatives:
#driver = webdriver.Chrome()    # Chrome; needs chromedriver available
#driver = webdriver.Firefox()   # Firefox without an explicit Service
#driver = webdriver.Firefox("/usr/local/bin")   # older positional-path form

SURCHARGE_URL = "https://www.fedex.com/zh-tw/shipping/surcharges.html"  # change to scrape another page
ACCEPT_COOKIES_SELECTOR = ".fxg-gdpr__accept-all-btn"  # "accept all cookies" button, found with Selenium IDE
# Cell selector found with Selenium IDE; {col} is the 1-based column number.
CELL_SELECTOR = (".fuelsurcharg-dynamic-datalookup "
                 ".cc-aem-c-table__tbody:nth-child(2) "
                 ".cc-aem-c-table__tbody__td:nth-child({col})")

try:
    # Wait up to 10 seconds for elements to appear. The original passed 3
    # while its comment (and the Perl version of this script) said 10.
    driver.implicitly_wait(10)
    driver.get(SURCHARGE_URL)
    #time.sleep(100)

    # By.CLASS_NAME takes a single class name, so the full compound class
    # string of the button cannot be used with it; a CSS selector on one of
    # the classes is used instead.
    # find_elements returns an empty list when nothing matches, so a missing
    # cookie banner is skipped instead of failing with IndexError.
    buttons = driver.find_elements(By.CSS_SELECTOR, ACCEPT_COOKIES_SELECTOR)
    if buttons:
        print(buttons[0])   # debug output: the WebElement repr
        buttons[0].click()
        # After accepting cookies the page has to be reloaded to load the data.
        driver.refresh()

    # Print the first three cells of the surcharge table. find_element raises
    # NoSuchElementException (naming the selector) if a cell is missing.
    for col in (1, 2, 3):
        cell = driver.find_element(By.CSS_SELECTOR, CELL_SELECTOR.format(col=col))
        print(cell.get_attribute("innerText"))

    # Optional debugging helpers:
    #print (driver.title)
    #html=driver.page_source
    #print (html)

    #soup = BeautifulSoup(driver.page_source, 'lxml')
    #print(soup.prettify())
    #
    #with open('index.html', 'w', encoding='utf-8',) as file:
    #    file.write(soup.prettify())
finally:
    # quit() ends the whole WebDriver session, not just the current window
    # as close() does, and it runs even when a lookup above raised.
    driver.quit()