# File size: 2,336 Bytes
# b5f1696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from Bio import Entrez
from bs4 import BeautifulSoup
import re


def remove_latex(text):
    r"""Strip LaTeX markup and section-heading keywords from ``text``.

    Three passes run in order:

    1. Commands that carry a braced argument, such as ``\usepackage{amsmath}``
       or ``\documentclass[12pt]{minimal}``, are removed together with the
       argument.
    2. Every remaining brace and backslash character is dropped, so
       ``\alpha`` becomes ``alpha`` and ``{x}`` becomes ``x``.
    3. The standalone words ``ABSTRACT`` and ``METHODS`` (any letter case)
       are removed.

    Whitespace is not normalised here; removed tokens may leave runs of
    spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The command name is restricted to letters (plus an optional ``*`` and
    # optional ``[...]`` options) and must be followed directly by ``{``.
    # A looser "anything up to the next brace" match would let a lone
    # backslash swallow ordinary prose up to the next ``{...}`` group.
    cleaned_text = re.sub(
        r'\\[A-Za-z]+\*?(?:\[[^\]]*\])?\{.*?\}', '', text)
    cleaned_text = re.sub(r'[{}\\]', '', cleaned_text)
    cleaned_text = re.sub(
        r'\b(ABSTRACT|METHODS)\b', '', cleaned_text, flags=re.IGNORECASE)
    return cleaned_text


def clean_text(text):
    """Return ``text`` with LaTeX removed and whitespace collapsed.

    The text is passed through ``remove_latex`` and then split on any
    whitespace (newlines included); the pieces are re-joined with single
    spaces, which also trims leading and trailing whitespace.
    """
    words = remove_latex(text).split()
    return ' '.join(words)


# Contact address set on Bio.Entrez before any request is made.
# NOTE(review): the value below looks like a scrubbed placeholder rather than
# a real address — confirm and replace it before running.
Entrez.email = "[email protected]"  # marked as required by the original author


def search_pmc_articles(query, max_papers):
    """Search the Entrez ``pmc`` database and return the matching IDs.

    Args:
        query: Search term, passed to ``Entrez.esearch`` as ``term``.
        max_papers: Maximum number of IDs to request, passed as ``retmax``.

    Returns:
        The ``"IdList"`` entry of the parsed search record.
    """
    handle = Entrez.esearch(db="pmc", term=query, retmax=max_papers)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]


def fetch_full_text(paper_id):
    """Download the XML record for one article from the Entrez ``pmc`` database.

    This is a best-effort fetch: any failure is reported on stdout and
    signalled to the caller with ``None`` instead of an exception.

    Args:
        paper_id: Article ID, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The raw response body as returned by ``handle.read()``, or ``None``
        if the request or the read failed.
    """
    try:
        handle = Entrez.efetch(db="pmc", id=paper_id, rettype="xml", retmode="xml")
        try:
            return handle.read()
        finally:
            # Close the handle even if reading the response raises.
            handle.close()
    except Exception as e:
        # Deliberately broad: one bad paper must not abort the whole crawl.
        print(f"Failed to fetch data for paper ID {paper_id}: {e}")
        return None


# Search settings. Twenty times more IDs are requested than papers wanted,
# presumably because many candidates are skipped by the filters below.
query = "molecular biology[Title] AND open access[Filter]"
max_papers = 1000
paper_ids = search_pmc_articles(query, max_papers*20)

# Write one prompt line (abstract) and one answer line (methods) per paper.
with open('papers_data_newbig.txt', 'w', encoding='utf-8') as file:
    current_papers = 0
    for paper_id in paper_ids:
        # Stop as soon as the requested number of papers has been written.
        if current_papers >= max_papers:
            break

        xml_full_text = fetch_full_text(paper_id)
        if not xml_full_text:
            continue

        soup = BeautifulSoup(xml_full_text, 'xml')
        abstract_text = soup.find('abstract')
        method_section = soup.find('sec', {'sec-type': 'methods'})

        # Only papers that have both an abstract and a methods section count.
        if not (abstract_text and method_section):
            continue

        abstract_content = clean_text(abstract_text.get_text())
        method_content = clean_text(method_section.get_text())

        file.writelines((
            f"Here is an example past experiment's abstract which tells you amongst other things the main results of what was studied, what methods do you think were used for it? {abstract_content}\n",
            f"{method_content}\n\n",
        ))
        current_papers += 1