"""Collect abstract/methods pairs from PubMed Central full-text articles.

Running this script searches PMC for ``query``, downloads the full-text XML
of each hit and, for every article that contains both an ``<abstract>``
element and a ``<sec sec-type="methods">`` element, appends a prompt line,
the cleaned abstract and the cleaned methods text to
``papers_data_newbig.txt``.
"""

import re

from Bio import Entrez
from bs4 import BeautifulSoup

# A LaTeX command followed by a brace group: backslash, alphabetic name,
# optional star, any number of [..] option groups, then text up to the first
# closing brace.  The name is restricted to letters so that a stray backslash
# cannot swallow ordinary prose up to the next "{" on the same line.
_LATEX_COMMAND_RE = re.compile(r'\\[A-Za-z]+\*?(?:\[[^\]\n]*\])*\{.*?\}')
# Braces and backslashes left over once the commands above are gone.
_LATEX_LEFTOVER_RE = re.compile(r'[{}\\]')
# The standalone words "abstract" / "methods", in any letter case.
_SECTION_WORD_RE = re.compile(r'\b(ABSTRACT|METHODS)\b', flags=re.IGNORECASE)

# Prompt written in front of every abstract in the output file.
PROMPT = (
    "Here is an example past experiment's abstract which tells you "
    "amongst other things the main results of what was studied, "
    "what methods do you think were used for it? "
)


def remove_latex(text: str) -> str:
    """Strip LaTeX markup and section-heading words from *text*.

    Three passes are applied in order:

    1. ``\\command{...}`` sequences (with optional ``*`` and ``[...]``) are
       removed together with their first brace group.
    2. Any remaining ``{``, ``}`` and ``\\`` characters are removed, so a
       brace-less command such as ``\\alpha`` is left as ``alpha``.
    3. The whole words ``ABSTRACT`` and ``METHODS`` are removed
       case-insensitively.  Note that this also removes them from running
       prose ("these methods were" becomes "these  were").

    Args:
        text: Raw text, possibly containing LaTeX fragments.

    Returns:
        The text with the markup removed.  Whitespace is not normalised here.
    """
    cleaned_text = _LATEX_COMMAND_RE.sub('', text)
    cleaned_text = _LATEX_LEFTOVER_RE.sub('', cleaned_text)
    return _SECTION_WORD_RE.sub('', cleaned_text)


def clean_text(text: str) -> str:
    """Return *text* without LaTeX and with whitespace collapsed.

    Every run of whitespace (including newlines) becomes a single space and
    leading/trailing whitespace is dropped, so the result is one line.
    """
    # str.split() with no argument splits on any whitespace and discards
    # empty fields, which covers newline replacement and stripping as well.
    return ' '.join(remove_latex(text).split())


# NCBI asks every E-utilities client to identify itself with a contact email.
Entrez.email = "polymathykhan@gmail.com"


def search_pmc_articles(query, max_papers):
    """Search the ``pmc`` database and return the matching article IDs.

    Args:
        query: Entrez search term.
        max_papers: Passed to ESearch as ``retmax`` (upper bound on the
            number of IDs returned).

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; errors are not
        handled here.
    """
    handle = Entrez.esearch(db="pmc", term=query, retmax=max_papers)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]


def fetch_full_text(paper_id):
    """Download the full-text XML of one PMC article.

    This is best-effort: any failure is reported on stdout and turned into
    ``None`` so that one bad article does not abort a long run.

    Args:
        paper_id: PMC ID as returned by :func:`search_pmc_articles`.

    Returns:
        The raw response body, or ``None`` if the request failed.
    """
    try:
        handle = Entrez.efetch(db="pmc", id=paper_id, rettype="xml", retmode="xml")
        try:
            return handle.read()
        finally:
            # Close the handle even if reading the body raises.
            handle.close()
    except Exception as exc:  # deliberate catch-all, see docstring
        print(f"Failed to fetch data for paper ID {paper_id}: {exc}")
        return None


# Fetch paper IDs
query = "molecular biology[Title] AND open access[Filter]"
max_papers = 1000
# Ask for 20x more IDs than needed; articles without both sections are
# skipped below, so this presumably leaves headroom to still reach
# max_papers.
# NOTE(review): NCBI documents an upper limit on ESearch retmax - confirm
# that this many IDs are actually returned.
paper_ids = search_pmc_articles(query, max_papers * 20)

# Open a file to write
with open('papers_data_newbig.txt', 'w', encoding='utf-8') as file:
    current_papers = 0  # number of articles written so far
    for paper_id in paper_ids:
        if current_papers >= max_papers:
            break

        xml_full_text = fetch_full_text(paper_id)
        if not xml_full_text:
            # Download failed or returned an empty body.
            continue

        soup = BeautifulSoup(xml_full_text, 'xml')
        abstract_text = soup.find('abstract')
        method_section = soup.find('sec', {'sec-type': 'methods'})
        if abstract_text is None or method_section is None:
            # Only articles that have both sections are usable.
            continue

        abstract_content = clean_text(abstract_text.get_text())
        method_content = clean_text(method_section.get_text())
        # One record: prompt line, abstract line, methods line, blank line.
        file.write(f"{PROMPT}\n{abstract_content}\n")
        file.write(f"{method_content}\n\n")
        current_papers += 1