from PyPDF2 import PdfReader
import nltk
nltk.download('punkt')

# Extracting Text from PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf = PdfReader(file)
        text = " ".join(page.extract_text() for page in pdf.pages)
    return text

# Extract text from the PDF and take a sample passage
file_path = "path/to/your/document.pdf"  # placeholder: replace with the actual PDF path
text = extract_text_from_pdf(file_path)
sample = text[1015:3037]
print(sample)
"""
=======
Output:
=======
Brazil is the world's fifth-largest country by area and the seventh most popul ous. Its capital
is Brasília, and its most popul ous city is São Paulo. The federation is composed of the union of the 26
states and the Federal District. It is the only country in the Americas to have Portugue se as an official
langua ge.[11][12] It is one of the most multicultural and ethnically diverse nations, due to over a century of
mass immigration from around t he world,[13] and the most popul ous Roman Catholic-majority country.
Bounde d by the Atlantic Ocean on the east, Brazil has a coastline of 7,491 kilometers (4,655 mi).[14] It
borders all other countries and territories in South America except Ecuador and Chile and covers roughl y
half of the continent's land area.[15] Its Amazon basin includes a vast tropical forest, home to diverse
wildlife, a variety of ecological systems, and extensive natural resources spanning numerous protected
habitats.[14] This unique environmental heritage positions Brazil at number one of 17 megadiverse
countries, and is the subject of significant global interest, as environmental degradation through processes
like deforestation has direct impacts on gl obal issues like climate change and biodiversity loss.
The territory which would become know n as Brazil was inhabited by numerous tribal nations prior to the
landing in 1500 of explorer Pedro Álvares Cabral, who claimed the discovered land for the Portugue se
Empire. Brazil remained a Portugue se colony until 1808 when the capital of the empire was transferred
from Lisbon to Rio de Janeiro. In 1815, the colony was elevated to the rank of kingdom upon the
formation of the United Kingdom of Portugal, Brazil and the Algarves. Independence was achieved in
1822 with the creation of the Empire of Brazil, a unitary state gove rned unde r a constitutional monarchy
and a parliamentary system. The ratification of the first constitution in 1824 led to the formation of a
bicameral legislature, now called the National Congress.
"""
import nltk
nltk.download('punkt')

# Splitting Text into Sentences with NLTK
def split_text_into_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

sentences = split_text_into_sentences(text)
import spacy

# Splitting Text into Sentences with spaCy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
sentences = list(doc.sents)
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter with custom parameters
custom_text_splitter = RecursiveCharacterTextSplitter(
    # Set custom chunk size
    chunk_size=100,
    chunk_overlap=20,
    # Use length of the text as the size measure
    length_function=len,
)

# Create the chunks
texts = custom_text_splitter.create_documents([sample])

# Print the first two chunks
print(f'### Chunk 1: \n\n{texts[0].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{texts[1].page_content}\n\n=====')
"""
=======
Output:
=======
### Chunk 1:
Brazil is the world's fifth-largest country by area and the seventh most popul ous. Its capital
=====
### Chunk 2:
is Brasília, and its most popul ous city is São Paulo. The federation is composed of the union of
=====
"""
# Initialize the text splitter with custom parameters
custom_text_splitter = RecursiveCharacterTextSplitter(
    # Set custom chunk size
    chunk_size=300,
    chunk_overlap=30,
    # Use length of the text as the size measure
    length_function=len,
    # Use only "\n" as the separator
    separators=['\n']
)

# Create the chunks
custom_texts = custom_text_splitter.create_documents([sample])

# Print the first two chunks
print(f'### Chunk 1: \n\n{custom_texts[0].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{custom_texts[1].page_content}\n\n=====')
# Print the sampled chunks
print("==== Sample chunks from 'Standard Parameters': ====\n\n")
for i, chunk in enumerate(texts):
    if i < 4:
        print(f"### Chunk {i+1}: \n{chunk.page_content}\n")

print("==== Sample chunks from 'Custom Parameters': ====\n\n")
for i, chunk in enumerate(custom_texts):
    if i < 4:
        print(f"### Chunk {i+1}: \n{chunk.page_content}\n")
"""
=======
Output:
=======
==== Sample chunks from 'Standard Parameters': ====
### Chunk 1:
Brazil is the world's fifth-largest country by area and the seventh most popul ous. Its capital
### Chunk 2:
is Brasília, and its most popul ous city is São Paulo. The federation is composed of the union of
### Chunk 3:
of the union of the 26
### Chunk 4:
states and the Federal District. It is the only country in the Americas to have Portugue se as an
==== Sample chunks from 'Custom Parameters': ====
### Chunk 1:
Brazil is the world's fifth-largest country by area and the seventh most popul ous. Its capital
is Brasília, and its most popul ous city is São Paulo. The federation is composed of the union of the 26
### Chunk 2:
states and the Federal District. It is the only country in the Americas to have Portugue se as an official
langua ge.[11][12] It is one of the most multicultural and ethnically diverse nations, due to over a century of
### Chunk 3:
mass immigration from around t he world,[13] and the most popul ous Roman Catholic-majority country.
Bounde d by the Atlantic Ocean on the east, Brazil has a coastline of 7,491 kilometers (4,655 mi).[14] It
### Chunk 4:
borders all other countries and territories in South America except Ecuador and Chile and covers roughl y
half of the continent's land area.[15] Its Amazon basin includes a vast tropical forest, home to diverse
"""
Figure 1: Distribution of chunk lengths from the Langchain splitter with different parameters

Figure 2: Distribution of chunk lengths produced by the Langchain splitter with custom parameters versus NLTK and spaCy
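The chunk-length distributions in the figures above can be reproduced with simple histograms. The following is a minimal sketch, assuming matplotlib is installed and that the `texts`, `custom_texts`, and `sentences` variables from the snippets above are still in scope:

import matplotlib.pyplot as plt

# Chunk lengths for each method (variables come from the earlier snippets)
langchain_default_lens = [len(chunk.page_content) for chunk in texts]
langchain_custom_lens = [len(chunk.page_content) for chunk in custom_texts]
sentence_lens = [len(sent.text) for sent in sentences]  # spaCy sentence spans

# Overlaid histograms of chunk lengths
plt.hist(langchain_default_lens, bins=20, alpha=0.5, label='Langchain (default parameters)')
plt.hist(langchain_custom_lens, bins=20, alpha=0.5, label='Langchain (custom parameters)')
plt.hist(sentence_lens, bins=20, alpha=0.5, label='Sentence splitting (NLTK/spaCy)')
plt.xlabel('Chunk length (characters)')
plt.ylabel('Count')
plt.legend()
plt.show()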
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Load the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define a list of sentences (your text data)
sentences = ["This is an example sentence.", "Another sentence goes here.", "..."]

# Generate embeddings for the sentences
embeddings = model.encode(sentences)

# Choose an appropriate number of clusters (here we choose 3 as an example)
num_clusters = 3

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters)
clusters = kmeans.fit_predict(embeddings)
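Before building word clouds, it can help to check how many sentences landed in each cluster. A small sketch, assuming the `clusters` array produced by K-means above:

from collections import Counter

# Count how many sentences were assigned to each cluster
cluster_sizes = Counter(clusters)
for cluster_id, size in sorted(cluster_sizes.items()):
    print(f"Cluster {cluster_id}: {size} sentences")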
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')

# Define a list of stop words
stop_words = set(stopwords.words('english'))

# Define a function to clean sentences
def clean_sentence(sentence):
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    # Convert to lower case
    tokens = [w.lower() for w in tokens]
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Remove non-alphabetic tokens
    words = [word for word in stripped if word.isalpha()]
    # Filter out stop words
    words = [w for w in words if not w in stop_words]
    return words

# Compute and plot a word cloud for each cluster
for i in range(num_clusters):
    cluster_sentences = [sentences[j] for j in range(len(sentences)) if clusters[j] == i]
    cleaned_sentences = [' '.join(clean_sentence(s)) for s in cluster_sentences]
    cluster_text_str = ' '.join(cleaned_sentences)  # avoid overwriting the extracted `text`
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(cluster_text_str)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Cluster {i}")
    plt.show()
Figure 3: Word cloud for KMeans cluster 0
Figure 4: Word cloud for KMeans cluster 1
Figure 5: Word cloud for KMeans cluster 2
import numpy as np
import spacy

# Load the Spacy model
nlp = spacy.load('en_core_web_sm')

def process(text):
    doc = nlp(text)
    sents = list(doc.sents)
    vecs = np.stack([sent.vector / sent.vector_norm for sent in sents])
    return sents, vecs

def cluster_text(sents, vecs, threshold):
    clusters = [[0]]
    for i in range(1, len(sents)):
        if np.dot(vecs[i], vecs[i-1]) < threshold:
            clusters.append([])
        clusters[-1].append(i)
    return clusters

def clean_text(text):
    # Add your text cleaning process here
    return text

# Initialize the clusters lengths list and final texts list
clusters_lens = []
final_texts = []

# Process the chunk
threshold = 0.3
sents, vecs = process(text)

# Cluster the sentences
clusters = cluster_text(sents, vecs, threshold)

for cluster in clusters:
    cluster_txt = clean_text(' '.join([sents[i].text for i in cluster]))
    cluster_len = len(cluster_txt)

    # Check if the cluster is too short
    if cluster_len < 60:
        continue

    # Check if the cluster is too long
    elif cluster_len > 3000:
        threshold = 0.6
        sents_div, vecs_div = process(cluster_txt)
        reclusters = cluster_text(sents_div, vecs_div, threshold)

        for subcluster in reclusters:
            div_txt = clean_text(' '.join([sents_div[i].text for i in subcluster]))
            div_len = len(div_txt)

            if div_len < 60 or div_len > 3000:
                continue

            clusters_lens.append(div_len)
            final_texts.append(div_txt)

    else:
        clusters_lens.append(cluster_len)
        final_texts.append(cluster_txt)
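The comparison output below can be produced by printing a couple of sample chunks from each method. A minimal sketch, assuming both the `custom_texts` list from the Langchain splitter and the `final_texts` list built above are in scope:

# Print two sample chunks from the Langchain splitter and from adjacent sentence clustering
print("==== Sample chunks from 'Langchain Splitter with Custom Parameters': ====\n\n")
for i, chunk in enumerate(custom_texts[:2]):
    print(f"### Chunk {i+1}: \n{chunk.page_content}\n")

print("==== Sample chunks from 'Adjacent Sentences Clustering': ====\n\n")
for i, chunk in enumerate(final_texts[:2]):
    print(f"### Chunk {i+1}: \n{chunk}\n")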
"""
=======
Output:
=======
==== Sample chunks from 'Langchain Splitter with Custom Parameters': ====
### Chunk 1:
Brazil is the world's fifth-largest country by area and the seventh most popul ous. Its capital
is Brasília, and its most popul ous city is São Paulo. The federation is composed of the union of the 26
### Chunk 2:
states and the Federal District. It is the only country in the Americas to have Portugue se as an official
langua ge.[11][12] It is one of the most multicultural and ethnically diverse nations, due to over a century of
==== Sample chunks from 'Adjacent Sentences Clustering': ====
### Chunk 1:
Brazil is the world's fifth-largest country by area and the seventh most popul ous. Its capital
is Brasília, and its most popul ous city is São Paulo.
### Chunk 2:
The federation is composed of the union of the 26
states and the Federal District. It is the only country in the Americas to have Portugue se as an official
langua ge.[11][12]
"""
# Compute the character length of each final chunk
final_texts_lengths = [len(chunk) for chunk in final_texts]
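The overall comparison in the figure below can be reproduced by extending the earlier histogram sketch with these lengths; a minimal example, again assuming matplotlib and the `custom_texts` variable are available:

import matplotlib.pyplot as plt

# Compare chunk lengths of the Langchain custom splitter and adjacent sentence clustering
plt.hist([len(chunk.page_content) for chunk in custom_texts], bins=20, alpha=0.5, label='Langchain (custom parameters)')
plt.hist(final_texts_lengths, bins=20, alpha=0.5, label='Adjacent sentence clustering')
plt.xlabel('Chunk length (characters)')
plt.ylabel('Count')
plt.legend()
plt.show()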
Figure 6: Distribution of chunk lengths for all of the tested methods

Figure 7: The different text chunking methods, illustrated as ways of cutting a pineapple