# crawler2.py
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import argparse
from urllib.parse import urlparse, parse_qs
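
# Third-party dependencies (assumed installed): requests, beautifulsoup4,
# and pandas; DataFrame.to_excel below additionally needs an Excel engine
# such as openpyxl (pip install openpyxl).
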
def get_gallery_id_from_url(url):
    """Extracts the gallery ID from the URL's query string."""
    try:
        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)
        # Fall back to 'gallery' when the URL carries no 'id' parameter.
        return query_params.get('id', [None])[0] or 'gallery'
    except Exception:
        return 'gallery'
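
# For example, get_gallery_id_from_url('https://gall.dcinside.com/mgallery/board/lists/?id=record')
# returns 'record', so the results below are saved as 'record.xlsx'.
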
def crawl_gallery(base_url, page_range, search_word=None, liked_number=None, liked_number_over=None):
    """
    Crawls a DCInside gallery for a given URL and page range.

    Args:
        base_url (str): The base URL of the gallery list.
        page_range (tuple): A tuple containing the start and end page numbers.
        search_word (str, optional): A word to filter titles by. Defaults to None.
        liked_number (int, optional): Exact number of likes to filter by. Defaults to None.
        liked_number_over (int, optional): Only keep posts with more likes than this. Defaults to None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    all_posts = []
    start_page, end_page = page_range

    for page_num in range(start_page, end_page + 1):
        # Assumes base_url already ends with a query string (e.g. '?id=...'),
        # so the page parameter is appended with '&'.
        paginated_url = f"{base_url}&page={page_num}"
        print(f"Crawling page: {paginated_url}")
        try:
            response = requests.get(paginated_url, headers=headers)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_num}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
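
        # The selectors below target DCInside's list-table markup: each post
        # is a <tr> whose cells carry the classes .gall_num, .gall_tit,
        # .gall_writer, .gall_count, and .gall_recommend. If the site's
        # markup changes, these selectors will need updating.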
        for tr in soup.select('tbody > tr'):
            try:
                num = tr.select_one('.gall_num').text.strip()
                # Skip notice/ad rows whose number cell is not a plain digit.
                if not num.isdigit():
                    continue

                title_element = tr.select_one('.gall_tit a')
                title = title_element.text.strip()

                # --- Search Word Filter ---
                if search_word and search_word not in title:
                    continue

                link = 'https://gall.dcinside.com' + title_element['href']
                author = tr.select_one('.gall_writer').text.strip()
                views = tr.select_one('.gall_count').text.strip()
                liked = tr.select_one('.gall_recommend').text.strip()

                # --- Liked Number Filter ---
                if liked_number is not None and int(liked) != liked_number:
                    continue
                if liked_number_over is not None and int(liked) <= liked_number_over:
                    continue

                all_posts.append({
                    'Number': num,
                    'Title': title,
                    'Author': author,
                    'Views': views,
                    'Link': link,
                    'Liked': liked
                })
            except (AttributeError, TypeError, ValueError):
                # Skip rows with missing cells or a non-numeric like count.
                continue

        # Be polite to the server between page requests.
        time.sleep(1)
    if not all_posts:
        print("No valid posts found across the specified pages with the given criteria.")
        return

    gallery_id = get_gallery_id_from_url(base_url)
    df = pd.DataFrame(all_posts)
    excel_filename = f"{gallery_id}.xlsx"
    df.to_excel(excel_filename, index=False)
    print(f"Saved {len(all_posts)} posts to {excel_filename}")

def main():
    parser = argparse.ArgumentParser(description="Crawl a DCInside gallery and save posts to an Excel file.")
    parser.add_argument('-l', '--link', required=True, help="The full URL of the gallery board list. E.g., 'https://gall.dcinside.com/mgallery/board/lists/?id=record'")
    parser.add_argument('-p', '--pages', required=False, default='1-100', help="The range of pages to crawl. E.g., '1-5'. Defaults to '1-100'.")
    parser.add_argument('-S', '--search-word', required=False, help="An optional word to search for in post titles.")
    parser.add_argument('-L', '--liked-number', type=int, help="Only get posts with this exact number of likes.")
    parser.add_argument('--liked-number-over', type=int, help="Only get posts with more than this number of likes.")
    args = parser.parse_args()

    try:
        start_page, end_page = map(int, args.pages.split('-'))
        if start_page <= 0 or end_page < start_page:
            raise ValueError("Page range must be positive and in increasing order (e.g., 1-5).")
    except ValueError as e:
        print(f"Error: Invalid page range format. {e}")
        return

    print("Running crawler...")
    crawl_gallery(args.link, (start_page, end_page), args.search_word, args.liked_number, args.liked_number_over)


if __name__ == "__main__":
    main()
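
# Example invocation (sample gallery id taken from the --link help text):
#   python crawler2.py -l 'https://gall.dcinside.com/mgallery/board/lists/?id=record' -p 1-3 --liked-number-over 10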