filter duplicate cids
This commit is contained in:
parent
e9af92c00f
commit
0d2412813f
2 changed files with 16 additions and 7 deletions
|
|
@ -15,6 +15,8 @@ const sleep = (timeout: number) => new Promise((r) => setTimeout(r, timeout));
|
|||
export class DirectoryTailer {
|
||||
public abort = new AbortController();
|
||||
|
||||
lastBatchCIDs = new Set<string>();
|
||||
|
||||
latestDate: string | undefined;
|
||||
saveRaw: boolean = true; // set to false in production so you don't double-store plc data
|
||||
|
||||
|
|
@ -72,13 +74,19 @@ export class DirectoryTailer {
|
|||
|
||||
let entry: ExportEntry | undefined;
|
||||
const promises = [];
|
||||
const cids = new Set<string>();
|
||||
for (const line of new IterLines(text)) {
|
||||
entry = JSON.parse(line) as unknown as ExportEntry;
|
||||
if (this.lastBatchCIDs.has(entry.cid)) continue;
|
||||
|
||||
this.latestDate = entry.createdAt;
|
||||
cids.add(entry.cid);
|
||||
promises.push(this.processRecord(entry, line));
|
||||
}
|
||||
await Promise.all(promises);
|
||||
|
||||
this.lastBatchCIDs = cids;
|
||||
|
||||
if (entry) {
|
||||
this.latestDate = entry.createdAt;
|
||||
const write = Deno.writeTextFile("./data/latest-date", this.latestDate);
|
||||
|
|
|
|||
13
scrape.ts
13
scrape.ts
|
|
@ -17,14 +17,15 @@ export const catchUp = async () => {
|
|||
lineReader.releaseLock();
|
||||
}
|
||||
|
||||
let lastLine: string | undefined;
|
||||
tailer.lastBatchCIDs.clear();
|
||||
for await (const line of lineStream.values()) {
|
||||
lastLine = line;
|
||||
}
|
||||
|
||||
if (lastLine) {
|
||||
const entry = JSON.parse(lastLine) as unknown as ExportEntry;
|
||||
try {
|
||||
const entry = JSON.parse(line) as unknown as ExportEntry;
|
||||
tailer.latestDate = entry.createdAt;
|
||||
tailer.lastBatchCIDs.add(entry.cid);
|
||||
} catch (_err) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue