from whoosh import index, qparser
from whoosh.fields import *
import os

# Define the schema for the index.
# `category` is an ID field so that a value such as "Category 1" is indexed
# as a single term; a KEYWORD field would split it on whitespace into
# "Category" and "1", which makes grouping by category meaningless.
schema = Schema(
    title=TEXT(stored=True),
    author=TEXT(stored=True),
    category=ID(stored=True),
    content=TEXT,
)

# Create the index (create_in expects the directory to exist already)
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

# Open the index for writing
writer = ix.writer()

# Add documents to the index
writer.add_document(
    title="Document 1",
    author="Author 1",
    category="Category 1",
    content="This is the content of document 1",
)
writer.add_document(
    title="Document 2",
    author="Author 2",
    category="Category 2",
    content="This is the content of document 2",
)
writer.add_document(
    title="Document 3",
    author="Author 3",
    category="Category 1",
    content="This is the content of document 3",
)

# Commit the changes
writer.commit()

# Open the index for reading; the context manager closes the searcher
# (and the files it holds open) when the block ends.
with ix.searcher() as searcher:
    # Parse the query
    parser = qparser.QueryParser("content", ix.schema)
    query = parser.parse("document")

    # Perform the search and ask Whoosh to group the hits by category
    results = searcher.search(query, groupedby="category")

    # Iterate through the results and print them
    for result in results:
        print(result)

    # Faceted search: groups() maps each category value to the list of
    # matching document numbers, so its length is the per-category count.
    facet = results.groups("category")
    for category in facet:
        print(f"Category: {category}")
        print(f"Number of documents: {len(facet[category])}")
Things about ML, DL, Data Science, AI
Saturday, December 17, 2022
Facets in Whoosh – an example!
Friday, December 16, 2022
How to add documents to the existing index in Whoosh?
To add documents to an existing index in Whoosh, you will need to follow these steps:
1. First, you will need to open the index using the whoosh.index.open_dir function. This function takes the directory where the index is stored as an argument and returns an Index object:
from whoosh import index
# Open the existing index stored in the "indexdir" directory
ix = index.open_dir("indexdir")
2. Next, you will need to create a whoosh.writing.IndexWriter object using the Index.writer method. The IndexWriter object allows you to add documents to the index:
# Get an index writer from the index; documents are added through it
writer = ix.writer()
3. Now you can use the IndexWriter.add_document method to add documents to the index. The add_document method takes the document's fields as keyword arguments. Each keyword name must match a field name in the index's schema, and each value is the content to index for that field:
# Collect the field values first, then pass them to the writer as keyword
# arguments (each name must be a field defined in the index's schema).
new_document = {
    "title": "My Document",
    "body": "This is the body of my document.",
    "date": "2022-01-01",
}
writer.add_document(**new_document)
4. After you have added all the documents you want to add, you will need to call the IndexWriter.commit method to save the changes to the index:
# Commit the changes so the added documents are saved to the index
writer.commit()
Thursday, July 8, 2021
Comparing two DataFrames of different dimensions based on one shared column
import pandas as pd
def compareDF(df1,df2,ColumnName, diff=''):
    """Outer-join two DataFrames on ``ColumnName`` for side-by-side comparison.

    Every column of ``df1`` except the key gets the suffix ``_l`` and every
    column of ``df2`` except the key gets ``_r``. The frames are then merged
    with an outer join on the key, missing values are replaced by 0, the rows
    are sorted by the key, and rows whose non-key values add up to 0 are
    removed.

    The input frames are left untouched, so the function can be called
    repeatedly with the same arguments.

    Args:
        df1: Left DataFrame; must contain ``ColumnName``.
        df2: Right DataFrame; must contain ``ColumnName``.
        ColumnName: Label of the column both frames share.
        diff: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the key column followed by the ``*_l`` and
        ``*_r`` columns, indexed from 0.
    """
    def _tagged(frame, suffix):
        # set_axis returns a new frame, so the caller's column names survive.
        labels = [
            ColumnName if str(col) == str(ColumnName) else str(col) + suffix
            for col in frame.columns
        ]
        return frame.set_axis(labels, axis=1)

    merged = pd.merge(
        _tagged(df1, '_l'),
        _tagged(df2, '_r'),
        on=ColumnName,
        how='outer',
    )
    merged = merged.fillna(0).sort_values(ColumnName)

    # Drop rows that carry no information: the non-key values sum to zero.
    row_totals = merged.drop(columns=[ColumnName]).sum(axis=1)
    merged = merged[row_totals != 0]
    return merged.reset_index(drop=True)
def formatX(c): # Optional
    """Turn a currency-style value such as ``"$1,234.50"`` into a float.

    The value is converted to text, every ``,`` and ``$`` character is
    removed, and what remains is parsed with ``float``. Text that is not a
    valid number therefore raises ``ValueError``.
    """
    strip_symbols = str.maketrans('', '', ',$')
    return float(str(c).translate(strip_symbols))
def cleanDF(df, columnName):
    """Return a zero-filled copy of ``df`` with its value columns as floats.

    Missing values are replaced by 0, then every column except
    ``columnName`` is converted with ``formatX``. A column holding a value
    that ``formatX`` cannot parse (for example free text) is left as it is.

    Args:
        df: DataFrame to clean; it is not modified.
        columnName: Label of the key column, which is never converted.

    Returns:
        The cleaned DataFrame.
    """
    cleaned = df.fillna(0)  # fillna returns a new frame
    for col in list(cleaned):
        if col == columnName:
            continue
        try:
            # Convert one column at a time instead of rebuilding every row.
            cleaned[col] = cleaned[col].apply(formatX)
        except (ValueError, TypeError):
            # Not a numeric column: keep the original values.
            continue
    return cleaned
def getDiff(df, columnName):
    """Compute right-minus-left differences for a frame made by ``compareDF``.

    For every base name ``X`` that appears as ``X_l`` and/or ``X_r``, a
    column ``X`` holding ``X_r - X_l`` is produced; a missing side counts as
    0. All ``*_l`` and ``*_r`` columns are then removed. The key column and
    any other unsuffixed columns are passed through unchanged.

    The input frame is not modified, so calling the function again on the
    same frame gives the same result.

    Args:
        df: DataFrame with ``*_l`` / ``*_r`` column pairs.
        columnName: Label of the key column; it is never treated as a
            suffixed column, whatever its name ends with.

    Returns:
        A new DataFrame with the unsuffixed columns and one difference
        column per base name.
    """
    out = df.copy()  # never write into the caller's frame

    def _side(suffix):
        # Labels on one side of the comparison, in column order.
        return [
            col for col in out.columns
            if isinstance(col, str) and col != columnName and col.endswith(suffix)
        ]

    left_cols = _side('_l')
    right_cols = _side('_r')
    present = set(left_cols) | set(right_cols)

    # Left-side names first, then names that exist only on the right.
    bases = list(dict.fromkeys(
        [col[:-2] for col in left_cols] + [col[:-2] for col in right_cols]
    ))

    for base in bases:
        left = out[base + '_l'] if base + '_l' in present else 0
        right = out[base + '_r'] if base + '_r' in present else 0
        try:
            out[base] = right - left
        except TypeError:
            # Values that cannot be subtracted (e.g. text) get no diff column.
            continue

    return out.drop(columns=left_cols + right_cols)
# Two frames with the same columns: clean each one, then merge them on "id"
df1 = cleanDF(pd.DataFrame([[1, 5, 3], [2, 3, 5]], columns=["id", "A", "B"]), 'id')
df2 = cleanDF(pd.DataFrame([[1, 2, 3], [2, 3, 6]], columns=["id", "A", "B"]), 'id')
result = compareDF(df1, df2, 'id')
result.head()
| | id | A_l | B_l | A_r | B_r |
|---|---|---|---|---|---|
| 0 | 1 | 5.0 | 3.0 | 2.0 | 3.0 |
| 1 | 2 | 3.0 | 5.0 | 3.0 | 6.0 |
# Right - Left: each value column becomes <name>_r minus <name>_l
result2=getDiff(result, 'id')
not there _r ['id', 'A_l', 'B_l', 'A_r', 'B_r', 'A', 'B'] not there _r ['id', 'A_l', 'B_l', 'A_r', 'B_r', 'A', 'B']
# Show the first rows of the differences
result2.head()
| | id | A | B |
|---|---|---|---|
| 0 | 1 | -3.0 | 0.0 |
| 1 | 2 | 0.0 | 1.0 |
# Frames of different shape: df2 has an extra column "C" and an extra row.
# df2 is passed first, so its columns get the "_l" suffix.
df1 = cleanDF(pd.DataFrame([[1, 5, 3], [2, 3, 5]], columns=["id", "A", "B"]), 'id')
df2 = cleanDF(
    pd.DataFrame(
        [[1, 2, 3, 4], [2, 3, 6, 2], [12, 13, 16, 1]],
        columns=["id", "A", "B", "C"],
    ),
    'id',
)
result = compareDF(df2, df1, 'id')
result.head()
| | id | A_l | B_l | C_l | A_r | B_r |
|---|---|---|---|---|---|---|
| 0 | 1 | 2.0 | 3.0 | 4.0 | 5.0 | 3.0 |
| 1 | 2 | 3.0 | 6.0 | 2.0 | 3.0 | 5.0 |
| 2 | 12 | 13.0 | 16.0 | 1.0 | 0.0 | 0.0 |
# Right - Left again; "C" has no right-hand column, so it is compared with 0
result2=getDiff(result, 'id')
result2.head()
| | id | A | B | C |
|---|---|---|---|---|
| 0 | 1 | 3.0 | 0.0 | -4.0 |
| 1 | 2 | 0.0 | -1.0 | -2.0 |
| 2 | 12 | -13.0 | -16.0 | -1.0 |