import os
import re

from Bio import Entrez
from bs4 import BeautifulSoup


# A LaTeX command with one argument: backslash, alphabetic command name,
# optional star, optional [options], then a {...} group.  Restricting the
# name to letters keeps an escaped character such as "\%" from swallowing
# all the prose up to the next brace group on the same line.
_LATEX_COMMAND_RE = re.compile(r'\\[A-Za-z]+\*?(?:\[[^\]]*\])?\{.*?\}')

# Braces and backslashes left over once whole commands have been removed.
_LATEX_LEFTOVER_RE = re.compile(r'[{}\\]')

# Whole-word section labels, matched case-insensitively.
_SECTION_LABEL_RE = re.compile(r'\b(ABSTRACT|METHODS)\b', flags=re.IGNORECASE)


def remove_latex(text):
    """Remove LaTeX markup and section labels from *text* using regexes.

    Three passes are applied in order:

    1. Commands of the form ``\\name{arg}`` (optionally ``\\name*`` and/or
       ``\\name[opts]``) are deleted together with their argument.
    2. Any remaining ``{``, ``}`` and ``\\`` characters are deleted.
    3. The whole words "abstract" and "methods" are deleted regardless of
       case.  This applies to every occurrence, including ones in the middle
       of a sentence, not only to headings.

    Whitespace is left untouched; callers that need normalized spacing must
    collapse it themselves.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned_text = _LATEX_COMMAND_RE.sub('', text)
    cleaned_text = _LATEX_LEFTOVER_RE.sub('', cleaned_text)
    return _SECTION_LABEL_RE.sub('', cleaned_text)


def clean_text(text):
    """Clean *text* and flatten it onto a single line for output.

    LaTeX markup and section labels are stripped via ``remove_latex``; every
    run of whitespace (spaces, tabs, newlines) is then collapsed to a single
    space and leading/trailing whitespace is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, single-line string.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace, so no separate newline replacement or
    # strip() is needed before re-joining.
    return ' '.join(remove_latex(text).split())


# Contact address sent along with every Entrez request.
# NOTE(review): the fallback literal appears to be a scrubbed placeholder
# rather than a real address; set the ENTREZ_EMAIL environment variable (or
# replace the literal) before running against NCBI.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "[email protected]")


def search_pmc_articles(query, max_papers):
    """Search the "pmc" Entrez database and return the matching record IDs.

    Args:
        query: Entrez search term, passed through as ``term``.
        max_papers: Upper bound on the number of IDs requested, passed
            through as ``retmax``.
            NOTE(review): NCBI documents a per-request ceiling on ``retmax``;
            confirm whether larger values are capped by the server.

    Returns:
        The ``"IdList"`` entry of the parsed ESearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; errors are not
        handled here.
    """
    handle = Entrez.esearch(db="pmc", term=query, retmax=max_papers)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]


def fetch_full_text(paper_id):
    """Download the XML record of one paper from the "pmc" Entrez database.

    This is a best-effort helper: any exception raised while fetching or
    reading is reported on stdout and turned into a ``None`` result so that a
    single bad record does not abort the caller's loop.

    Args:
        paper_id: Identifier of the paper, passed through as ``id``.

    Returns:
        The data read from the EFetch response, or ``None`` if the request
        failed.
    """
    try:
        handle = Entrez.efetch(db="pmc", id=paper_id, rettype="xml", retmode="xml")
        try:
            return handle.read()
        finally:
            # Close the handle even when read() fails part-way through.
            handle.close()
    except Exception as e:  # deliberate catch-all: skip this paper, keep going
        print(f"Failed to fetch data for paper ID {paper_id}: {e}")
        return None


# --- Harvest abstract/methods pairs into a plain-text file -----------------

query = "molecular biology[Title] AND open access[Filter]"
max_papers = 1000

# Ask for 20x more IDs than the number of papers wanted; presumably this
# compensates for the papers skipped below (failed downloads, or records
# without both an abstract and a methods section).
paper_ids = search_pmc_articles(query, max_papers*20)

with open('papers_data_newbig.txt', 'w', encoding='utf-8') as file:
    current_papers = 0  # number of papers written so far
    for paper_id in paper_ids:
        if current_papers >= max_papers:
            break

        xml_full_text = fetch_full_text(paper_id)
        if not xml_full_text:
            # Download failed or returned nothing; try the next ID.
            continue

        soup = BeautifulSoup(xml_full_text, 'xml')
        abstract_text = soup.find('abstract')
        method_section = soup.find('sec', {'sec-type': 'methods'})
        if abstract_text is None or method_section is None:
            # Only papers that have both sections are usable.
            continue

        abstract_content = clean_text(abstract_text.get_text())
        method_content = clean_text(method_section.get_text())
        if not (abstract_content and method_content):
            # A section that is empty after cleaning would produce a useless
            # record, so it does not count towards max_papers.
            continue

        # One record = prompt line containing the abstract, then the methods
        # text, then a blank line as separator.
        file.write(
            f"Here is an example past experiment's abstract which tells you amongst other things the main results of what was studied, what methods do you think were used for it? {abstract_content}\n")
        file.write(
            f"{method_content}\n\n")

        current_papers += 1