From 36b993d59a212925eba284df093c39af8d8641e4 Mon Sep 17 00:00:00 2001 From: Alex Yancey Date: Wed, 19 Jun 2024 22:15:55 -0700 Subject: [PATCH 1/5] Use lxml for HTML parsing instead of BeautifulSoup --- myusps/__init__.py | 47 ++++++++++++++++++++++------------------------ setup.py | 2 +- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/myusps/__init__.py b/myusps/__init__.py index ccf36b7..173702e 100644 --- a/myusps/__init__.py +++ b/myusps/__init__.py @@ -4,8 +4,7 @@ import logging import os.path import pickle -import re -from bs4 import BeautifulSoup +import lxml.html from dateutil.parser import parse import requests from requests.auth import AuthBase @@ -62,15 +61,15 @@ def _load_cookies(filename): def _get_primary_status(row): """Get package primary status.""" try: - return row.find('div', {'class': 'pack_h3'}).string - except AttributeError: + return row.xpath(".//div[contains(@class,'pack_h3')]")[0].text.strip() + except IndexError: return None def _get_secondary_status(row): """Get package secondary status.""" try: - return row.find('div', {'id': 'coltextR3'}).contents[1] + return row.xpath(".//div[@id='coltextR3']/text()")[1].strip() except (AttributeError, IndexError): return None @@ -78,10 +77,10 @@ def _get_secondary_status(row): def _get_shipped_from(row): """Get where package was shipped from.""" try: - spans = row.find('div', {'id': 'coltextR2'}).find_all('span') + spans = row.xpath(".//div[@id='coltextR2']/span") if len(spans) < 2: return None - return spans[1].string + return spans[2].text except AttributeError: return None @@ -89,10 +88,10 @@ def _get_shipped_from(row): def _get_status_timestamp(row): """Get latest package timestamp.""" try: - divs = row.find('div', {'id': 'coltextR3'}).find_all('div') + divs = row.xpath(".//div[@id='coltextR3']/div") if len(divs) < 2: return None - timestamp_string = divs[1].string + timestamp_string = divs[1].text except AttributeError: return None try: @@ -104,8 +103,8 @@ def _get_status_timestamp(row): def _get_delivery_date(row): """Get delivery date (estimated or actual).""" try: - month = row.find('div', {'class': 'date-small'}).string - day = row.find('div', {'class': 'date-num-large'}).string + month = row.xpath(".//div[contains(@class,'date-small')]")[0].text.strip() + day = row.xpath(".//div[contains(@class,'date-num-large')]")[0].text.strip() except AttributeError: return None try: @@ -117,15 +116,15 @@ def _get_delivery_date(row): def _get_tracking_number(row): """Get package tracking number.""" try: - return row.find('div', {'class': 'pack_h4'}).string - except AttributeError: + return row.xpath(".//div[@class='pack_h4']")[0].text.strip() + except IndexError: return None def _get_mailpiece_image(row): """Get mailpiece image url.""" try: - return row.find('img', {'class': 'mailpieceIMG'}).get('src') + return row.xpath(".//img[@class='mailpieceIMG']/@src")[0] except AttributeError: return None @@ -223,15 +222,13 @@ def get_profile(session): response = session.get(PROFILE_URL, allow_redirects=False) if response.status_code == 302: raise USPSError('expired session') - parsed = BeautifulSoup(response.text, HTML_PARSER) - profile = parsed.find('div', {'class': 'atg_store_myProfileInfo'}) + parsed = lxml.html.fromstring(response.text) + profile = parsed.xpath("//div[@class='atg_store_myProfileInfo']")[0] data = {} - for row in profile.find_all('tr'): - cells = row.find_all('td') + for row in profile.xpath('.//tr'): + cells = row.xpath('.//td') if len(cells) == 2: - key = ' '.join(cells[0].find_all(text=True)).strip().lower().replace(' ', '_') - value = ' '.join(cells[1].find_all(text=True)).strip() - data[key] = value + data[cells[0].text.strip()] = cells[1].text.strip() return data @@ -240,9 +237,9 @@ def get_packages(session): """Get package data.""" _LOGGER.info("attempting to get package data") response = _get_dashboard(session) - parsed = BeautifulSoup(response.text, HTML_PARSER) + parsed = lxml.html.fromstring(response.text) packages = [] - for row in parsed.find_all('div', {'class': 'pack_row'}): + for row in parsed.xpath("//div[@class='pack_row']"): packages.append({ 'tracking_number': _get_tracking_number(row), 'primary_status': _get_primary_status(row), @@ -261,9 +258,9 @@ def get_mail(session, date=None): if not date: date = datetime.datetime.now().date() response = _get_dashboard(session, date) - parsed = BeautifulSoup(response.text, HTML_PARSER) + parsed = lxml.html.fromstring(response.text) mail = [] - for row in parsed.find_all('div', {'class': 'mailpiece'}): + for row in parsed.xpath("//div[@class='mailpiece']"): image = _get_mailpiece_image(row) if not image: continue diff --git a/setup.py b/setup.py index e1fb031..6d37145 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ author='happyleaves', author_email='happyleaves.tfr@gmail.com', packages=find_packages(), - install_requires=['beautifulsoup4==4.6.0', 'python-dateutil==2.6.0', 'requests>=2.20.0', 'requests-cache==0.4.13', 'selenium==3.11.0'], + install_requires=['lxml==5.2.2', 'python-dateutil==2.6.0', 'requests>=2.20.0', 'requests-cache==0.4.13', 'selenium==3.11.0'], classifiers=[ 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', From a2601cb4fe6b951d1d3fb89f06e6bfa3a677a44d Mon Sep 17 00:00:00 2001 From: Alex Yancey Date: Wed, 19 Jun 2024 22:42:17 -0700 Subject: [PATCH 2/5] Update libraries, replace Selenium with Playwright --- myusps/__init__.py | 93 +++++++++++++++++++++------------------------- setup.py | 3 +- 2 files changed, 45 insertions(+), 51 deletions(-) diff --git a/myusps/__init__.py b/myusps/__init__.py index 173702e..daed07b 100644 --- a/myusps/__init__.py +++ b/myusps/__init__.py @@ -9,11 +9,7 @@ import requests from requests.auth import AuthBase import requests_cache -from selenium import webdriver -from selenium.common.exceptions import TimeoutException, WebDriverException -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.firefox.options import Options +from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError _LOGGER = logging.getLogger(__name__) @@ -32,12 +28,12 @@ ATTRIBUTION = 'Information provided by www.usps.com' USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) ' \ 'Chrome/41.0.2228.0 Safari/537.36' -CHROME_WEBDRIVER_ARGS = [ - '--headless', '--user-agent={}'.format(USER_AGENT), '--disable-extensions', - '--disable-gpu', '--no-sandbox' -] -FIREFOXOPTIONS = Options() -FIREFOXOPTIONS.add_argument("--headless") +# CHROME_WEBDRIVER_ARGS = [ +# '--headless', '--user-agent={}'.format(USER_AGENT), '--disable-extensions', +# '--disable-gpu', '--no-sandbox' +# ] +# FIREFOXOPTIONS = Options() +# FIREFOXOPTIONS.add_argument("--headless") class USPSError(Exception): @@ -140,21 +136,8 @@ def _get_mailpiece_url(image): """Get mailpiece url.""" return '{}{}'.format(INFORMED_DELIVERY_IMAGE_URL, image) -def _get_driver(driver_type): - """Get webdriver.""" - if driver_type == 'phantomjs': - return webdriver.PhantomJS(service_log_path=os.path.devnull) - if driver_type == 'firefox': - return webdriver.Firefox(firefox_options=FIREFOXOPTIONS) - elif driver_type == 'chrome': - chrome_options = webdriver.ChromeOptions() - for arg in CHROME_WEBDRIVER_ARGS: - chrome_options.add_argument(arg) - return webdriver.Chrome(chrome_options=chrome_options) - else: - raise USPSError('{} not supported'.format(driver_type)) -def _login(session): +def _login(session, driver, headless): """Login. Use Selenium webdriver to login. USPS authenticates users @@ -170,23 +153,35 @@ def _login(session): session.remove_expired_responses() except AttributeError: pass - try: - driver = _get_driver(session.auth.driver) - except WebDriverException as exception: - raise USPSError(str(exception)) - driver.get(LOGIN_URL) - username = driver.find_element_by_name('username') - username.send_keys(session.auth.username) - password = driver.find_element_by_name('password') - password.send_keys(session.auth.password) - driver.find_element_by_id('btn-submit').click() - try: - WebDriverWait(driver, LOGIN_TIMEOUT).until(EC.title_is(WELCOME_TITLE)) - except TimeoutException: - raise USPSError('login failed') - for cookie in driver.get_cookies(): - session.cookies.set(name=cookie['name'], value=cookie['value']) - _save_cookies(session.cookies, session.auth.cookie_path) + + with sync_playwright() as p: + if driver == "chrome": + browser = p.chromium.launch(headless=headless) + elif driver == "firefox": + browser = p.firefox.launch(headless=headless) + elif driver == "webkit": + browser = p.webkit.launch(headless=headless) + else: + raise USPSError('{} not supported'.format(driver)) + + context = browser.new_context(user_agent=USER_AGENT) + page = context.new_page() + page.goto(LOGIN_URL) + + page.locator("xpath=//input[@id='username']").type(session.auth.username) + page.locator("xpath=//input[@id='password']").type(session.auth.password) + + page.locator("xpath=//button[@id='btn-submit']").click() + + try: + page.wait_for_function("document.title === '{}'".format(WELCOME_TITLE)) + except PlaywrightTimeoutError: + raise USPSError('login failed') + + for cookie in context.cookies(): + session.cookies.set(name=cookie["name"], value=cookie["value"]) + + _save_cookies(session.cookies, session.auth.cookie_path) def _get_dashboard(session, date=None): @@ -273,17 +268,16 @@ def get_mail(session, date=None): # pylint: disable=too-many-arguments def get_session(username, password, cookie_path=COOKIE_PATH, cache=True, - cache_expiry=300, cache_path=CACHE_PATH, driver='phantomjs'): + cache_expiry=300, cache_path=CACHE_PATH, driver='chrome', headless=False): """Get session, existing or new.""" class USPSAuth(AuthBase): # pylint: disable=too-few-public-methods """USPS authorization storage.""" - def __init__(self, username, password, cookie_path, driver): + def __init__(self, username, password, cookie_path): """Init.""" self.username = username self.password = password self.cookie_path = cookie_path - self.driver = driver def __call__(self, r): """Call is no-op.""" @@ -291,13 +285,12 @@ def __call__(self, r): session = requests.Session() if cache: - session = requests_cache.core.CachedSession(cache_name=cache_path, - expire_after=cache_expiry) - session.auth = USPSAuth(username, password, cookie_path, driver) + session = requests_cache.CachedSession(cache_name=cache_path, expire_after=cache_expiry) + session.auth = USPSAuth(username, password, cookie_path) session.headers.update({'User-Agent': USER_AGENT}) if os.path.exists(cookie_path): _LOGGER.debug("cookie found at: %s", cookie_path) session.cookies = _load_cookies(cookie_path) else: - _login(session) - return session \ No newline at end of file + _login(session, driver, headless) + return session diff --git a/setup.py b/setup.py index 6d37145..1d30fe2 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,8 @@ author='happyleaves', author_email='happyleaves.tfr@gmail.com', packages=find_packages(), - install_requires=['lxml==5.2.2', 'python-dateutil==2.6.0', 'requests>=2.20.0', 'requests-cache==0.4.13', 'selenium==3.11.0'], + install_requires=['lxml==5.2.2', 'python-dateutil==2.9.0.post0', 'requests>=2.32.3', 'requests-cache==1.2.1', + 'playwright==1.44.0'], classifiers=[ 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', From c68d4c3ec4831ddde18da74ff76c11193774d7a1 Mon Sep 17 00:00:00 2001 From: Alex Yancey Date: Wed, 19 Jun 2024 22:42:17 -0700 Subject: [PATCH 3/5] Update libraries, replace Selenium with Playwright --- myusps/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/myusps/__init__.py b/myusps/__init__.py index daed07b..1ed9cb3 100644 --- a/myusps/__init__.py +++ b/myusps/__init__.py @@ -28,12 +28,6 @@ ATTRIBUTION = 'Information provided by www.usps.com' USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) ' \ 'Chrome/41.0.2228.0 Safari/537.36' -# CHROME_WEBDRIVER_ARGS = [ -# '--headless', '--user-agent={}'.format(USER_AGENT), '--disable-extensions', -# '--disable-gpu', '--no-sandbox' -# ] -# FIREFOXOPTIONS = Options() -# FIREFOXOPTIONS.add_argument("--headless") class USPSError(Exception): @@ -278,6 +272,7 @@ def __init__(self, username, password, cookie_path): self.username = username self.password = password self.cookie_path = cookie_path + self.driver = driver def __call__(self, r): """Call is no-op.""" From c338b939ff8b9e5ab3a1b8fb32240e4d0f91a291 Mon Sep 17 00:00:00 2001 From: Alex Yancey Date: Wed, 19 Jun 2024 22:46:18 -0700 Subject: [PATCH 4/5] oops --- myusps/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/myusps/__init__.py b/myusps/__init__.py index 1ed9cb3..5d5ea29 100644 --- a/myusps/__init__.py +++ b/myusps/__init__.py @@ -272,7 +272,6 @@ def __init__(self, username, password, cookie_path): self.username = username self.password = password self.cookie_path = cookie_path - self.driver = driver def __call__(self, r): """Call is no-op.""" From 32754bef76b497eb559a8ee8d46b1b1d639581c0 Mon Sep 17 00:00:00 2001 From: Alex Yancey Date: Wed, 19 Jun 2024 22:55:16 -0700 Subject: [PATCH 5/5] update readme --- README.md | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 785496d..7917f37 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,12 @@ Python 3 API for [USPS Informed Delivery](https://my.usps.com/mobileWeb/pages/in Sign up for Informed Delivery and verify your address. -### Chrome - -Install Google Chrome and Chromedriver. These are dependencies for the Selenium webdriver, which is used internally to this module to facilitate the login process. - -Instructions (adapt as necessary for your OS): - - Ubuntu 16: https://gist.github.com/ziadoz/3e8ab7e944d02fe872c3454d17af31a5 - - RHEL 7: https://stackoverflow.com/a/46686621 - -Note that installing Selenium Server is not required. - ## Install -`pip install myusps` +```shell +pip install myusps +playwright install +``` ## Usage @@ -32,7 +25,8 @@ import myusps # Establish a session. # Use the login credentials you use to login to My USPS via the web. # A login failure raises a `USPSError`. -session = myusps.get_session("username", "password") +# Webdriver options are 'firefox', 'chrome', and 'webkit' +session = myusps.get_session("username", "password", driver="firefox") # Get your profile information as a dict. Includes name, address, phone, etc. profile = myusps.get_profile(session)