-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
63 lines (50 loc) · 2.3 KB
/
app.py
File metadata and controls
63 lines (50 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import time
from htmlExtractor import ImageDownloader
from ImageTextComparator import ImageTextComparator
from HTMLextractor import HTMLExtractor
from modelExtraction import ArticleExtractor
def are_images_downloaded(folder_name, valid_extensions):
    """Check whether the folder holds at least one entry with an image extension.

    The comparison is case-insensitive on both sides: entry names and the
    requested extensions are lower-cased before matching.

    Args:
        folder_name: Path of the directory to inspect.
        valid_extensions: A single extension string or any iterable of
            extension strings (e.g. ``(".png", ".jpg")``).

    Returns:
        True if at least one directory entry name ends with one of the
        extensions; False if none does or the folder does not exist yet.
    """
    # str.endswith only accepts a str or a tuple of str, so normalise lists,
    # sets and other iterables here instead of failing with a TypeError.
    if isinstance(valid_extensions, str):
        suffixes = (valid_extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in valid_extensions)

    try:
        entries = os.listdir(folder_name)
    except FileNotFoundError:
        # A folder that has not been created yet simply means
        # "nothing downloaded so far", not an error for a polling caller.
        return False

    return any(entry.lower().endswith(suffixes) for entry in entries)
def main():
    """Run the article pipeline for one hard-coded URL.

    Steps, in order:
      1. Ask ``ImageDownloader`` to download the article's images into the
         ``imagesCache`` folder.
      2. Poll that folder until at least one image file is present.
      3. Fetch the page HTML with ``HTMLExtractor`` and print it.
      4. Hand the HTML and the image folder to ``ArticleExtractor``.

    Raises:
        TimeoutError: If no image appears in the cache folder within the
            wait budget (previously the loop polled forever).
    """
    url = "https://www.purepeople.com/article/en-plein-divorce-avec-luana-paul-belmondo-partage-un-beau-moment-avec-leur-fils-victor-et-l-immortalise-en-photo_a526200/1"
    folder_name = "imagesCache"
    valid_extensions = (".png", ".jpeg", ".jpg", ".gif")
    poll_interval = 5  # seconds between two checks of the cache folder
    max_wait = 300  # seconds to wait for the first image before giving up

    # Create an instance of ImageDownloader
    downloader = ImageDownloader()

    # Call the image_download function
    downloader.image_download(
        url,
        folder_name,
        valid_extensions,
        exclude_keywords=("icon",),
        min_size=(100, 100),  # Minimum width and height
    )

    # Guarantee the folder exists so the polling below cannot fail with
    # FileNotFoundError when the downloader saved nothing.
    os.makedirs(folder_name, exist_ok=True)

    # Wait until images are downloaded, but never indefinitely: if the page
    # yields no matching image the script must stop with a clear error.
    deadline = time.monotonic() + max_wait
    while not are_images_downloaded(folder_name, valid_extensions):
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"No image with extensions {valid_extensions} appeared in "
                f"'{folder_name}' after {max_wait} seconds"
            )
        print("Waiting for images to be downloaded...")
        time.sleep(poll_interval)  # Wait before checking again

    # ---------------------------------------------------------------------- #
    # Disabled step: compare the downloaded images with the article text.
    # Kept for reference; it requires an API key and paths to be set up.
    #
    # api_key = "YOUR API KEY"
    # text_file_path = "article.txt"
    # comparator = ImageTextComparator(api_key)
    # comparator.generate_response(text_file_path, folder_name)
    # # Create an instance of the comparator
    # comparator = ImageTextComparator(api_key)
    # # Compare images with text
    # response_text = comparator.image_download_and_compare(text_file_path, folder_name)
    # print(response_text)
    # ---------------------------------------------------------------------- #

    html_extractor = HTMLExtractor()
    print("-------------------extracting HTML------------------...")
    html_content = html_extractor.get_html(url)
    print(html_content)
    print("---------------END EXTRACTING HTML-----------------------")

    llm = ArticleExtractor()
    llm.extract_content(html_content=html_content, image_folder_path=folder_name)
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()