XThomasBU
committed on
Commit
·
b2c9100
1
Parent(s):
f0018f2
Added 'date' to metadata
Browse files
- code/modules/data_loader.py +2 -1
- code/modules/helpers.py +39 -6
code/modules/data_loader.py
CHANGED
|
@@ -180,7 +180,8 @@ class ChunkProcessor:
|
|
| 180 |
self.document_metadata = []
|
| 181 |
|
| 182 |
lecture_metadata = get_lecture_metadata(
|
| 183 |
-
"https://dl4ds.github.io/sp2024/lectures/"
|
|
|
|
| 184 |
) # TODO: Use more efficiently
|
| 185 |
|
| 186 |
for file_index, file_path in enumerate(uploaded_files):
|
|
|
|
| 180 |
self.document_metadata = []
|
| 181 |
|
| 182 |
lecture_metadata = get_lecture_metadata(
|
| 183 |
+
"https://dl4ds.github.io/sp2024/lectures/",
|
| 184 |
+
"https://dl4ds.github.io/sp2024/schedule/",
|
| 185 |
) # TODO: Use more efficiently
|
| 186 |
|
| 187 |
for file_index, file_path in enumerate(uploaded_files):
|
code/modules/helpers.py
CHANGED
|
@@ -152,6 +152,7 @@ def get_sources(res, answer):
|
|
| 152 |
lecture_tldr = source_metadata.get("tldr", "N/A")
|
| 153 |
lecture_recording = source_metadata.get("lecture_recording", "N/A")
|
| 154 |
suggested_readings = source_metadata.get("suggested_readings", "N/A")
|
|
|
|
| 155 |
|
| 156 |
source_type = source_metadata.get("source_type", "N/A")
|
| 157 |
|
|
@@ -165,6 +166,7 @@ def get_sources(res, answer):
|
|
| 165 |
"lecture_tldr": lecture_tldr,
|
| 166 |
"lecture_recording": lecture_recording,
|
| 167 |
"suggested_readings": suggested_readings,
|
|
|
|
| 168 |
"source_type": source_type,
|
| 169 |
}
|
| 170 |
else:
|
|
@@ -206,6 +208,7 @@ def get_sources(res, answer):
|
|
| 206 |
full_answer += f"\nSource: {source_data['url']}\n"
|
| 207 |
full_answer += f"Page: {source_data['page']}\n"
|
| 208 |
full_answer += f"Type: {source_data['source_type']}\n"
|
|
|
|
| 209 |
full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
|
| 210 |
full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
|
| 211 |
full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
|
|
@@ -213,18 +216,42 @@ def get_sources(res, answer):
|
|
| 213 |
return full_answer, source_elements
|
| 214 |
|
| 215 |
|
| 216 |
-
def get_lecture_metadata(schedule_url):
|
| 217 |
"""
|
| 218 |
-
Function to get the lecture metadata from the schedule
|
| 219 |
"""
|
| 220 |
lecture_metadata = {}
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
# Get the main schedule page content
|
| 223 |
-
|
| 224 |
-
|
| 225 |
|
| 226 |
# Find all lecture blocks
|
| 227 |
-
lecture_blocks =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
for block in lecture_blocks:
|
| 230 |
try:
|
|
@@ -237,6 +264,9 @@ def get_lecture_metadata(schedule_url):
|
|
| 237 |
# Extract the link to the slides
|
| 238 |
slides_link_tag = block.find("a", title="Download slides")
|
| 239 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
# Extract the link to the lecture recording
|
| 242 |
recording_link_tag = block.find("a", title="Download lecture recording")
|
|
@@ -257,9 +287,12 @@ def get_lecture_metadata(schedule_url):
|
|
| 257 |
else:
|
| 258 |
suggested_readings = "No specific readings provided."
|
| 259 |
|
|
|
|
|
|
|
|
|
|
| 260 |
# Add to the dictionary
|
| 261 |
-
slides_link = f"https://dl4ds.github.io{slides_link}"
|
| 262 |
lecture_metadata[slides_link] = {
|
|
|
|
| 263 |
"tldr": tldr,
|
| 264 |
"title": title,
|
| 265 |
"lecture_recording": recording_link,
|
|
|
|
| 152 |
lecture_tldr = source_metadata.get("tldr", "N/A")
|
| 153 |
lecture_recording = source_metadata.get("lecture_recording", "N/A")
|
| 154 |
suggested_readings = source_metadata.get("suggested_readings", "N/A")
|
| 155 |
+
date = source_metadata.get("date", "N/A")
|
| 156 |
|
| 157 |
source_type = source_metadata.get("source_type", "N/A")
|
| 158 |
|
|
|
|
| 166 |
"lecture_tldr": lecture_tldr,
|
| 167 |
"lecture_recording": lecture_recording,
|
| 168 |
"suggested_readings": suggested_readings,
|
| 169 |
+
"date": date,
|
| 170 |
"source_type": source_type,
|
| 171 |
}
|
| 172 |
else:
|
|
|
|
| 208 |
full_answer += f"\nSource: {source_data['url']}\n"
|
| 209 |
full_answer += f"Page: {source_data['page']}\n"
|
| 210 |
full_answer += f"Type: {source_data['source_type']}\n"
|
| 211 |
+
full_answer += f"Date: {source_data['date']}\n"
|
| 212 |
full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
|
| 213 |
full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
|
| 214 |
full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
|
|
|
|
| 216 |
return full_answer, source_elements
|
| 217 |
|
| 218 |
|
| 219 |
+
def get_lecture_metadata(lectures_url, schedule_url):
|
| 220 |
"""
|
| 221 |
+
Function to get the lecture metadata from the lectures and schedule URLs.
|
| 222 |
"""
|
| 223 |
lecture_metadata = {}
|
| 224 |
|
| 225 |
+
# Get the main lectures page content
|
| 226 |
+
r_lectures = requests.get(lectures_url)
|
| 227 |
+
soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
|
| 228 |
+
|
| 229 |
# Get the main schedule page content
|
| 230 |
+
r_schedule = requests.get(schedule_url)
|
| 231 |
+
soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
|
| 232 |
|
| 233 |
# Find all lecture blocks
|
| 234 |
+
lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")
|
| 235 |
+
|
| 236 |
+
# Create a mapping from slides link to date
|
| 237 |
+
date_mapping = {}
|
| 238 |
+
schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
|
| 239 |
+
for row in schedule_rows:
|
| 240 |
+
try:
|
| 241 |
+
date = (
|
| 242 |
+
row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
|
| 243 |
+
)
|
| 244 |
+
description_div = row.find("div", {"data-label": "Description"})
|
| 245 |
+
slides_link_tag = description_div.find("a", title="Download slides")
|
| 246 |
+
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
| 247 |
+
slides_link = (
|
| 248 |
+
f"https://dl4ds.github.io{slides_link}" if slides_link else None
|
| 249 |
+
)
|
| 250 |
+
if slides_link:
|
| 251 |
+
date_mapping[slides_link] = date
|
| 252 |
+
except Exception as e:
|
| 253 |
+
print(f"Error processing schedule row: {e}")
|
| 254 |
+
continue
|
| 255 |
|
| 256 |
for block in lecture_blocks:
|
| 257 |
try:
|
|
|
|
| 264 |
# Extract the link to the slides
|
| 265 |
slides_link_tag = block.find("a", title="Download slides")
|
| 266 |
slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
|
| 267 |
+
slides_link = (
|
| 268 |
+
f"https://dl4ds.github.io{slides_link}" if slides_link else None
|
| 269 |
+
)
|
| 270 |
|
| 271 |
# Extract the link to the lecture recording
|
| 272 |
recording_link_tag = block.find("a", title="Download lecture recording")
|
|
|
|
| 287 |
else:
|
| 288 |
suggested_readings = "No specific readings provided."
|
| 289 |
|
| 290 |
+
# Get the date from the schedule
|
| 291 |
+
date = date_mapping.get(slides_link, "No date available")
|
| 292 |
+
|
| 293 |
# Add to the dictionary
|
|
|
|
| 294 |
lecture_metadata[slides_link] = {
|
| 295 |
+
"date": date,
|
| 296 |
"tldr": tldr,
|
| 297 |
"title": title,
|
| 298 |
"lecture_recording": recording_link,
|