# Source: Image_Text_Comparaison_AI/app.py (main branch), shadlia/Image_Text_Comparaison_AI on GitHub
import os
import time
from htmlExtractor import ImageDownloader
from ImageTextComparator import ImageTextComparator
from HTMLextractor import HTMLExtractor
from modelExtraction import ArticleExtractor


def are_images_downloaded(folder_name, valid_extensions):
    """Check whether a folder holds at least one entry with an image extension.

    Args:
        folder_name: Path of the directory to inspect.
        valid_extensions: A single extension string or an iterable of
            extension strings, e.g. ``(".png", ".jpg")``. Comparison is
            case-insensitive on both the entry name and the extensions.

    Returns:
        True if any entry name in ``folder_name`` ends with one of the
        extensions; False otherwise, including when the folder does not
        exist yet.
    """
    # str.endswith accepts only a str or a tuple of str, so normalise any
    # other iterable (list, set, ...) and lower-case to match entry.lower().
    if isinstance(valid_extensions, str):
        suffixes = valid_extensions.lower()
    else:
        suffixes = tuple(ext.lower() for ext in valid_extensions)

    try:
        entries = os.listdir(folder_name)
    except FileNotFoundError:
        # A folder that has not been created yet simply means "nothing
        # downloaded so far" for a polling caller.
        return False

    return any(entry.lower().endswith(suffixes) for entry in entries)


def main(url=None, folder_name="imagesCache", max_wait_seconds=300, poll_interval=5):
    """Download a page's images, then fetch its HTML and run the extractor.

    Steps, in order: ask ``ImageDownloader`` to download images from ``url``
    into ``folder_name``, poll until at least one image file is present,
    fetch the page HTML with ``HTMLExtractor``, print it, and hand the HTML
    plus the image folder to ``ArticleExtractor.extract_content``.

    Args:
        url: Page to process. When omitted, the sample purepeople.com
            article URL is used.
        folder_name: Directory passed to the downloader and later to the
            extractor as the image folder.
        max_wait_seconds: Upper bound, in seconds, on the polling phase.
        poll_interval: Seconds to sleep between two polling checks.

    Raises:
        TimeoutError: If no file with a valid image extension is found in
            ``folder_name`` within ``max_wait_seconds``.
    """
    if url is None:
        url = "https://www.purepeople.com/article/en-plein-divorce-avec-luana-paul-belmondo-partage-un-beau-moment-avec-leur-fils-victor-et-l-immortalise-en-photo_a526200/1"
    valid_extensions = (".png", ".jpeg", ".jpg", ".gif")

    # Create an instance of ImageDownloader
    downloader = ImageDownloader()

    # Call the image_download function
    downloader.image_download(
        url,
        folder_name,
        valid_extensions,
        # NOTE(review): presumably skips images whose name/URL contains
        # "icon" -- confirm against ImageDownloader.
        exclude_keywords=("icon",),
        min_size=(100, 100),  # Minimum width and height
    )

    # Wait until images are downloaded, but never forever: without a
    # deadline the script hangs when the page yields no qualifying image.
    deadline = time.monotonic() + max_wait_seconds
    while not are_images_downloaded(folder_name, valid_extensions):
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"No image found in {folder_name!r} after {max_wait_seconds} seconds"
            )
        print("Waiting for images to be downloaded...")
        time.sleep(poll_interval)  # Wait before checking again

    # The ImageTextComparator step (it needs an API key and the
    # "article.txt" text file) is currently disabled and not executed here.

    html_extractor = HTMLExtractor()
    print("-------------------extracting HTML------------------...")
    html_content = html_extractor.get_html(url)
    print(html_content)
    print("---------------END EXTRACTING HTML-----------------------")
    llm = ArticleExtractor()
    llm.extract_content(html_content=html_content, image_folder_path=folder_name)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()